diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..71dbdabf3dd9f49d690f3ec24300a1a5d8864a69 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,34 @@ +version: 2.1 + +# this allows you to use CircleCI's dynamic configuration feature +setup: true + +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 + +workflows: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: + jobs: + # the path-filtering/filter job determines which pipeline + # parameters to update. + - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # + mapping: | + mmaction/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + tools/.* lint_only false + configs/.* lint_only false + .circleci/.* lint_only false + base-revision: dev-1.x + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. 
+ config-path: .circleci/test.yml diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..53c009c9ec39b9884bec484e2af2d9302faed008 --- /dev/null +++ b/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/.circleci/test.yml b/.circleci/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..933efc31bd806246f92052c8a796bb79e30bcd56 --- /dev/null +++ b/.circleci/test.yml @@ -0,0 +1,211 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 50 mmaction + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - 
checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get upgrade + sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config + sudo apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install mmaction dependencies + command: | + pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0' + pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x + pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmpretrain.git@dev + pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x + pip install -r requirements.txt + - run: + name: Install timm + command: | + pip install timm + - run: + name: Install transformers + command: | + pip install transformers + - when: + condition: + equal: [ "0.10.0", << parameters.torchvision >> ] + steps: + - run: python -m pip install pytorchvideo + - run: + name: Build and install + command: | + pip install -e . 
+ - run: + name: Run unittests + command: | + coverage run --branch --source mmaction -m pytest tests/ + coverage xml + coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["11.1"] + cudnn: + type: integer + default: 8 + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + # docker_layer_caching: true + resource_class: gpu.nvidia.small + steps: + - checkout + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmaction:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmaction -w /mmaction --name mmaction mmaction:gpu + docker exec mmaction apt-get update + docker exec mmaction apt-get upgrade -y + docker exec mmaction apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config + docker exec mmaction apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Install PytorchVideo and timm + command: | + docker exec mmaction pip install timm + docker exec mmaction python -m pip install pytorchvideo + - run: + name: Install transformers + command: | + docker exec mmaction pip install transformers + - run: + name: Install mmaction dependencies + command: | + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmengine.git@main + docker exec mmaction pip install -U openmim + docker exec mmaction mim install 'mmcv >= 2.0.0' + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x + docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmpose.git@dev-1.x + docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + docker exec mmaction pip install git+https://github.com/open-mmlab/mmpretrain.git@dev + docker exec mmaction pip 
install -r requirements.txt + - run: + name: Build and install + command: | + docker exec mmaction pip install -e . + - run: + name: Run unittests + command: | + docker exec mmaction pytest tests/ +workflows: + pr_stage_lint: + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + pr_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + - build_cpu: + name: minimum_version_cpu + torch: 1.8.1 + torchvision: 0.9.1 + python: 3.7.4 + requires: + - lint + - build_cpu: + name: maximum_version_cpu + torch: 1.13.0 + torchvision: 0.14.0 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.1" + requires: + - hold + merge_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.1" + filters: + branches: + only: + - dev-1.x + - main diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9108caa13b9cfaaa13799bc7643be68c703d6fab 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +demo/demo.mp4 filter=lfs diff=lfs merge=lfs -text +resources/data_pipeline.png filter=lfs diff=lfs merge=lfs -text +resources/miaomiao_qrcode.jpg filter=lfs diff=lfs merge=lfs -text +resources/mmaction2_overview.gif filter=lfs diff=lfs merge=lfs -text +resources/qq_group_qrcode.jpg filter=lfs diff=lfs merge=lfs -text 
+resources/spatio-temporal-det.gif filter=lfs diff=lfs merge=lfs -text +resources/zhihu_qrcode.jpg filter=lfs diff=lfs merge=lfs -text +tests/data/rawvideo_dataset/part_1.mp4 filter=lfs diff=lfs merge=lfs -text +tests/data/test.avi filter=lfs diff=lfs merge=lfs -text +tests/data/test.mp4 filter=lfs diff=lfs merge=lfs -text +tests/data/test.wav filter=lfs diff=lfs merge=lfs -text +tools/data/skeleton/S001C001P001R001A001_rgb.avi filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b05b2f768cd121cc33150cbdebf477666bb207a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,151 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +**/*.pyc + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Auto generate documentation +docs/*/_build/ +docs/*/model_zoo/ +docs/*/dataset_zoo/ +docs/*/_model_zoo.rst +docs/*/modelzoo_statistics.md +docs/*/datasetzoo_statistics.md +docs/*/projectzoo.md +docs/*/papers/ +docs/*/api/generated/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# custom +/data +.vscode +.idea +*.pkl +*.pkl.json +*.log.json +benchlist.txt +work_dirs/ +/projects/*/work_dirs +/projects/*/data +.DS_Store + +# Pytorch +*.pth + +# Profile +*.prof + +# lmdb +*.mdb + +# unignore some data file in tests/data +!tests/data/**/*.pkl +!tests/data/**/*.pkl.json +!tests/data/**/*.log.json +!tests/data/**/*.pth + +# avoid soft links created by MIM +mmaction/tools/* + +*.ipynb + +# unignore ipython notebook files in demo +!demo/*.ipynb +!projects/stad_tutorial/*.ipynb +mmaction/.mim diff --git a/.owners.yml b/.owners.yml new file mode 100644 index 0000000000000000000000000000000000000000..626aaab1890e4f3e5183be2cfa2fdc4ed4c9289b --- /dev/null +++ b/.owners.yml @@ -0,0 +1,16 @@ +assign: + issues: enabled + pull_requests: disabled + strategy: + # random + daily-shift-based + scedule: + '*/1 * * * *' + assignees: + - hukkai + - Dai-Wenxun + - cir7 + - Dai-Wenxun + - cir7 + - hukkai + - hukkai diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91b101351e98ee45f74446e68b7787faba0f1ab2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,52 @@ +exclude: ^tests/data/ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: ["--skip", "*.ipynb", "-L", "ECT,Gool,tread,gool,mot"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.14 + hooks: + - id: mdformat + args: ["--number", "--table-width", "200"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-algo-readme + - id: check-copyright + args: ["mmaction", "tests", "demo", "tools"] diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000000000000000000000000000000000000..121ebd1e079a81927454c66e80d10530764e040e --- /dev/null +++ b/.pylintrc @@ -0,0 +1,624 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. 
+extension-pkg-whitelist= + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10 + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=CVS,configs + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. 
For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=import-outside-toplevel, + redefined-outer-name, + print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, 
comprehension-escape, + no-member, + invalid-name, + too-many-branches, + wrong-import-order, + too-many-arguments, + missing-function-docstring, + missing-module-docstring, + too-many-locals, + too-few-public-methods, + abstract-method, + broad-except, + too-many-nested-blocks, + too-many-instance-attributes, + missing-class-docstring, + duplicate-code, + not-callable, + protected-access, + dangerous-default-value, + no-name-in-module, + logging-fstring-interpolation, + super-init-not-called, + redefined-builtin, + attribute-defined-outside-init, + arguments-differ, + cyclic-import, + bad-super-call, + too-many-statements, + line-too-long + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. 
+score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. 
+ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. 
Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )?<?https?://\S+>?$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. 
+check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. 
+#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _, + x, + y, + w, + h, + a, + b + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. 
+#variable-rgx= + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. 
+known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". +overgeneral-exceptions=BaseException, + Exception diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..e4fba2183587225f216eeada4c78dfab6b2e65f5 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..50641e1bf8a13f12a6a90a7ea335fa53e3c8341a --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,14 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.9" + +formats: + - epub + +python: + install: + - requirements: requirements/docs.txt + - requirements: requirements/readthedocs.txt diff --git a/0.7.1 b/0.7.1 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/2.0.0 b/2.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/3.0.0 b/3.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/CITATION.cff 
b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..c1d3eb5836d14209fbea113ea6b9625f262022a9 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,8 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: + - name: "MMAction2 Contributors" +title: "OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark" +date-released: 2020-07-21 +url: "https://github.com/open-mmlab/mmaction2" +license: Apache-2.0 diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000000000000000000000000000000000000..28b16a606993196f5193812a5009798cfdeee2cd --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,186 @@ +# Deployment Guide for Hugging Face Spaces + +This guide will help you deploy the GenVidBench project on Hugging Face Spaces. + +## 🚀 Quick Start + +### 1. Prepare Your Repository + +Make sure your repository contains these files: +- `app.py` - Main Gradio application +- `requirements.txt` - Python dependencies +- `README.md` - Space metadata and description +- `download_model.py` - Model download script +- `setup.py` - Setup script (optional) + +### 2. Create a Hugging Face Space + +1. Go to [Hugging Face Spaces](https://huggingface.co/spaces) +2. Click "Create new Space" +3. Fill in the details: + - **Space name**: `genvidbench` (or your preferred name) + - **License**: Apache 2.0 + - **SDK**: Gradio + - **Hardware**: CPU Basic (or GPU if needed) + - **Visibility**: Public or Private + +### 3. Upload Your Code + +You can either: +- **Option A**: Push your code to a GitHub repository and connect it +- **Option B**: Upload files directly through the web interface + +### 4. Configure the Space + +The space will automatically: +- Install dependencies from `requirements.txt` +- Run `app.py` as the main application +- Display the Gradio interface + +## 📁 Required Files + +### `app.py` +Your main Gradio application file. 
Should contain: +- Model initialization +- Gradio interface definition +- Video processing logic + +### `requirements.txt` +List of Python packages needed: +``` +torch>=1.13.0 +torchvision>=0.14.0 +mmcv>=2.0.0 +mmengine>=0.7.1 +gradio>=4.0.0 +opencv-python>=4.6.0 +decord>=0.6.0 +# ... other dependencies +``` + +### `README.md` +Space metadata and description: +```yaml +--- +title: GenVidBench - Video Action Recognition +emoji: 🎬 +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 4.0.0 +app_file: app.py +pinned: false +license: apache-2.0 +--- +``` + +## 🔧 Model Setup + +### Option 1: Automatic Download +The app will try to download the model checkpoint automatically using `download_model.py`. + +### Option 2: Manual Upload +1. Download the model checkpoint manually +2. Upload it to your space's file system +3. Place it in the `checkpoints/` directory + +### Option 3: Hugging Face Hub +Store the model on Hugging Face Hub and load it programmatically: +```python +from huggingface_hub import hf_hub_download +checkpoint_path = hf_hub_download(repo_id="your-username/your-model", filename="model.pth") +``` + +## 🐛 Troubleshooting + +### Common Issues + +1. **Model not found** + - Ensure the checkpoint file is in the correct location + - Check file permissions + - Verify the download completed successfully + +2. **Import errors** + - Check that all dependencies are in `requirements.txt` + - Verify package versions are compatible + +3. **Memory issues** + - Consider using a smaller model + - Optimize batch size + - Use CPU instead of GPU if needed + +4. **Slow loading** + - Pre-download models during setup + - Use model caching + - Optimize model size + +### Debug Mode + +Add debug information to your app: +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +## 📊 Performance Optimization + +### For Hugging Face Spaces + +1. **Model Size**: Keep models under 2GB for faster loading +2. **Dependencies**: Minimize the number of packages +3. 
**Caching**: Use model caching to avoid re-downloading +4. **Lazy Loading**: Load models only when needed + +### Example Optimizations + +```python +# Lazy model loading +model = None + +def get_model(): + global model + if model is None: + model = init_recognizer(config_file, checkpoint_file, device='cpu') + return model + +def analyze_video(video): + model = get_model() # Load only when needed + # ... rest of the function +``` + +## 🔒 Security Considerations + +1. **File Upload Limits**: Set appropriate limits for video uploads +2. **Input Validation**: Validate video formats and sizes +3. **Resource Limits**: Monitor CPU/memory usage +4. **Error Handling**: Graceful error handling for edge cases + +## 📈 Monitoring + +Monitor your space: +- Check logs in the Hugging Face interface +- Monitor resource usage +- Track user interactions +- Set up alerts for failures + +## 🚀 Going Live + +Once everything is working: + +1. **Test thoroughly** with different video types +2. **Optimize performance** for your target audience +3. **Add documentation** for users +4. **Monitor usage** and gather feedback +5. **Iterate and improve** based on user needs + +## 📞 Support + +If you encounter issues: +- Check the Hugging Face Spaces documentation +- Review the logs in your space +- Test locally first +- Ask for help in the Hugging Face community + +--- + +**Happy deploying! 🎉** diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..797bf40e85c5d2986ebcec9cb51aed979ca88b82 --- /dev/null +++ b/LICENSE @@ -0,0 +1,203 @@ +Copyright 2018-2019 Open-MMLab. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2019 Open-MMLab. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..13a0db319d94bf8988e73ee1d45954de4efe18d9 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include mmaction/.mim/model-index.yml +include mmaction/.mim/dataset-index.yml +recursive-include mmaction/.mim/configs *.py *.yml +recursive-include mmaction/.mim/tools *.sh *.py diff --git a/README.md b/README.md index 6df8ea27d36706cc95d4565976f9adf7ec92da64..d080a66e4f6f5971cfedcecab23361dee0e325b5 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,92 @@ ---- -title: Deepfake Detector -emoji: 💻 -colorFrom: pink -colorTo: gray -sdk: gradio -sdk_version: 5.47.2 -app_file: app.py -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- +title: GenVidBench - Video Action Recognition +emoji: 🎬 +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 4.0.0 +app_file: app.py +pinned: false +license: apache-2.0 +short_description: State-of-the-art video action recognition using MMAction2 +--- + +# GenVidBench - Video Action Recognition + +A powerful video analysis tool that uses state-of-the-art deep learning models to recognize actions and activities in videos. Built on top of MMAction2 framework with a user-friendly Gradio interface. 
+ +## 🚀 Features + +- **Action Recognition**: Identify actions and activities in videos using TSN (Temporal Segment Networks) +- **Top-5 Predictions**: Get the most likely actions with confidence scores +- **Multiple Formats**: Support for MP4, AVI, MOV, and other video formats +- **Real-time Processing**: Fast inference optimized for web deployment +- **User-friendly Interface**: Clean and intuitive Gradio web interface + +## 🎯 Model Details + +This demo uses: +- **Model**: TSN (Temporal Segment Networks) with ResNet-50 backbone +- **Dataset**: Trained on Kinetics-400 dataset (400 action classes) +- **Framework**: MMAction2 (OpenMMLab) +- **Input**: RGB video frames +- **Output**: Top-5 action predictions with confidence scores + +## 🛠️ Technical Stack + +- **Backend**: Python, PyTorch, MMAction2 +- **Frontend**: Gradio +- **Video Processing**: OpenCV, Decord +- **Deployment**: Hugging Face Spaces + +## 📖 How to Use + +1. **Upload Video**: Click the upload area or drag and drop your video file +2. **Wait for Processing**: The model will analyze your video (usually takes a few seconds) +3. **View Results**: See the top 5 predicted actions with confidence scores + +## 💡 Tips for Best Results + +- **Video Length**: Shorter videos (under 30 seconds) process faster +- **Video Quality**: Clear, well-lit videos work best +- **Action Clarity**: Videos with clear, distinct actions yield better results +- **Supported Formats**: MP4, AVI, MOV, and other common video formats + +## 🔬 Supported Actions + +The model can recognize 400 different action classes from the Kinetics-400 dataset, including: +- Sports activities (basketball, soccer, tennis, etc.) +- Daily activities (cooking, cleaning, reading, etc.) +- Physical exercises (push-ups, jumping jacks, etc.) +- Musical activities (playing instruments, singing, etc.) +- And many more! 
+ +## 🏗️ Architecture + +``` +Video Input → Frame Sampling → Feature Extraction → Classification → Top-5 Predictions +``` + +## 📊 Performance + +- **Accuracy**: State-of-the-art performance on Kinetics-400 +- **Speed**: Optimized for real-time inference +- **Memory**: Efficient GPU/CPU utilization + +## 🤝 Contributing + +This project is part of the GenVidBench framework. Contributions are welcome! + +## 📄 License + +This project is licensed under the Apache License 2.0 - see the LICENSE file for details. + +## 🙏 Acknowledgments + +- [MMAction2](https://github.com/open-mmlab/mmaction2) - The underlying framework +- [OpenMMLab](https://openmmlab.com/) - For the excellent computer vision tools +- [Hugging Face](https://huggingface.co/) - For the deployment platform + +--- + +**Note**: This is a demonstration of video action recognition capabilities. For production use, consider additional validation and error handling. \ No newline at end of file diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..862d202ed532115e5c1d7d0a1b286005bdd5ac01 --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,398 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+ +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) +[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) +[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/main/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) + +[📘中文文档](https://mmaction2.readthedocs.io/zh_CN/latest/index.html) | +[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/latest/get_started/installation.html) | +[👀模型库](https://mmaction2.readthedocs.io/zh_CN/latest/modelzoo_statistics.html) | +[🆕更新日志](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) | +[🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) | +[🤔报告问题](https://github.com/open-mmlab/mmaction2/issues/new/choose) + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +[English](/README.md) | 简体中文 + +## 📄 目录 + +- [📄 目录](#-目录) +- [🥳 🚀 最新进展](#--最新进展-) +- [📖 简介](#-简介-) +- [🎁 主要功能](#-主要功能-) +- [🛠️ 安装](#️-安装-) +- [👀 模型库](#-模型库-) +- [👨‍🏫 新手入门](#-新手入门-) +- [🎫 许可证](#-许可证-) +- [🖊️ 引用](#️-引用-) +- [🙌 参与贡献](#-参与贡献-) +- [🤝 致谢](#-致谢-) +- [🏗️ OpenMMLab 的其他项目](#️-openmmlab-的其他项目-) +- [❤️ 欢迎加入 OpenMMLab 社区](#️-欢迎加入-openmmlab-社区-) + +## 🥳 🚀 最新进展 [🔝](#-table-of-contents) + +**默认分支已经从 `master` (当前的`0.x`) 切换到 `main`(之前的 `1.x`),我们建议用户更新至最新版本,其支持更多模型,更强的预训练权重,以及更简洁的代码实现。详情请参阅[迁移指南](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html)** + +**Release (2023.07.04)**: v1.1.0 支持以下新功能: + +- 支持基于 CLIP 的多模态模型: ActionCLIP(Arxiv'2021) 和 CLIP4clip(ArXiv'2022) +- 支持丰富的 project: 手势识别, 时空行为检测 tutorial, 以及基于 [MMRazor](https://github.com/open-mmlab/mmrazor) 的知识蒸馏 +- 支持 HACS-segments 数据集(ICCV'2019), MultiSports 数据集(ICCV'2021), Kinetics-710 数据集(Arxiv'2022) +- 支持 VideoMAE V2(CVPR'2023), VideoMAE(NeurIPS'2022) 支持时空行为检测任务 +- 支持 TCANet(CVPR'2021) +- 支持 [纯 Python 风格的配置文件](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) 和使用 MIM 一键下载数据集 + +## 📖 简介 [🔝](#-table-of-contents) + +MMAction2 是一款基于 PyTorch 开发的行为识别开源工具包,是 [open-mmlab](https://github.com/open-mmlab) 项目的一个子项目。 + +
+ + +

Kinetics-400 数据集行为识别结果(左) 和 NTU-RGB+D-120 数据集基于骨架的行为识别结果(右)

+
+ +
+
+

Kinetics-400 数据集基于骨骼点的时空行为检测及视频行为识别结果

+
+
+
+

AVA-2.1 数据集时空行为检测结果

+
+ +## 🎁 主要功能 [🔝](#-table-of-contents) + +- **模块化设计**: 我们将视频理解框架拆分成了不同模块,用户可以很方便地通过组合不同的模块来构建出自定义的视频理解框架。 + +- **支持五种主要的视频理解任务**: MMAction2 为视频理解任务实现了多种多样的算法,包括行为识别,时序动作定位,时空动作检测,基于骨骼点的行为识别,以及视频检索。 + +- **详尽的单元测试和文档**:我们提供了详尽的文档和 API 参考手册,以及单元测试。 + +## 🛠️ 安装 [🔝](#-table-of-contents) + +MMAction2依赖于 [PyTorch](https://pytorch.org/),[MMCV](https://github.com/open-mmlab/mmcv),[MMEngine](https://github.com/open-mmlab/mmengine),[MMDetection](https://github.com/open-mmlab/mmdetection) (可选)和 [MMPose](https://github.com/open-mmlab/mmpose) (可选) + +具体步骤请参考 [安装文档](https://mmaction2.readthedocs.io/zh_cn/latest/get_started/installation.html)。 + +
+快速安装 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +conda install pytorch torchvision -c pytorch # 该命令将自动安装最新版的 PyTorch 和 cudatoolkit,请确认此是否匹配你的当前环境。 +pip install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet # 可选 +mim install mmpose # 可选 +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . +``` + +
+ +## 👀 模型库 [🔝](#-table-of-contents) + +结果及模型位于[模型库](https://mmaction2.readthedocs.io/zh_cn/latest/modelzoo_statistics.html) + +
+ +模型支持 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
行为识别
C3D (CVPR'2014)TSN (ECCV'2016)I3D (CVPR'2017)C2D (CVPR'2018)I3D Non-Local (CVPR'2018)
R(2+1)D (CVPR'2018)TRN (ECCV'2018)TSM (ICCV'2019)TSM Non-Local (ICCV'2019)SlowOnly (ICCV'2019)
SlowFast (ICCV'2019)CSN (ICCV'2019)TIN (AAAI'2020)TPN (CVPR'2020)X3D (CVPR'2020)
MultiModality: Audio (ArXiv'2020)TANet (ArXiv'2020)TimeSformer (ICML'2021)ActionCLIP (ArXiv'2021)VideoSwin (CVPR'2022)
VideoMAE (NeurIPS'2022)MViT V2 (CVPR'2022)UniFormer V1 (ICLR'2022)UniFormer V2 (Arxiv'2022)VideoMAE V2 (CVPR'2023)
时序动作定位
BSN (ECCV'2018)BMN (ICCV'2019)TCANet (CVPR'2021)
时空行为检测
ACRN (ECCV'2018)SlowOnly+Fast R-CNN (ICCV'2019)SlowFast+Fast R-CNN (ICCV'2019)LFB (CVPR'2019)VideoMAE (NeurIPS'2022)
基于骨骼点的行为识别
ST-GCN (AAAI'2018)2s-AGCN (CVPR'2019)PoseC3D (CVPR'2022)STGCN++ (ArXiv'2022)CTRGCN (CVPR'2021)
MSG3D (CVPR'2020)
视频检索
CLIP4Clip (ArXiv'2022)
+ +
+ +
+ +数据集支持 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
行为识别
HMDB51 (官网) (ICCV'2011)UCF101 (官网) (CRCV-IR-12-01)ActivityNet (官网) (CVPR'2015)Kinetics-[400/600/700] (官网) (CVPR'2017)
SthV1 (ICCV'2017)SthV2 (官网) (ICCV'2017)Diving48 (官网) (ECCV'2018)Jester (官网) (ICCV'2019)
Moments in Time (官网) (TPAMI'2019)Multi-Moments in Time (官网) (ArXiv'2019)HVU (官网) (ECCV'2020)OmniSource (官网) (ECCV'2020)
FineGYM (官网) (CVPR'2020)Kinetics-710 (官网) (Arxiv'2022)
时序动作定位
THUMOS14 (官网) (THUMOS Challenge 2014)ActivityNet (官网) (CVPR'2015)HACS (官网) (ICCV'2019)
时空行为检测
UCF101-24* (官网) (CRCV-IR-12-01)JHMDB* (官网) (ICCV'2015)AVA (官网) (CVPR'2018)AVA-Kinetics (官网) (Arxiv'2020)
基于骨架的行为识别
PoseC3D-FineGYM (官网) (ArXiv'2021)PoseC3D-NTURGB+D (官网) (ArXiv'2021)PoseC3D-UCF101 (官网) (ArXiv'2021)PoseC3D-HMDB51 (官网) (ArXiv'2021)
视频检索
MSRVTT (官网) (CVPR'2016)
+ +
+ +## 👨‍🏫 新手入门 [🔝](#-table-of-contents) + +我们提供了一系列简明的教程,帮助新用户轻松上手使用: + +- [从 MMAction2 0.X 迁移](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html) +- [学习配置相关知识](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/config.html) +- [准备数据集](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/prepare_dataset.html) +- [使用现有模型进行推理](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/inference.html) +- [训练与测试](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/train_test.html) + +
+基于 MMAction2 的社区工作 + +- Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) + +
+ +## 🎫 许可证 [🔝](#-table-of-contents) + +本项目基于 [Apache 2.0 license](LICENSE) 发布。 + +## 🖊️ 引用 [🔝](#-table-of-contents) + +如你发现本项目对你的研究有帮助,请参考如下 bibtex 引用 MMAction2。 + +```BibTeX +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## 🙌 参与贡献 [🔝](#-table-of-contents) + +我们感谢所有的贡献者为改进和提升 MMAction2 所作出的努力。请参考[贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 + +## 🤝 致谢 [🔝](#-table-of-contents) + +MMAction2 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望此工具箱可以帮助大家来复现已有的方法和开发新的方法,从而为研究社区贡献力量。 + +## 🏗️ OpenMMLab 的其他项目 [🔝](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMEval](https://github.com/open-mmlab/mmeval): 统一开放的跨框架算法评测库 +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 
少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱 +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 +- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目 + +## ❤️ 欢迎加入 OpenMMLab 社区 [🔝](#-table-of-contents) + +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),扫描下方微信二维码添加喵喵好友,进入 MMAction2 微信交流社群。【加好友申请格式:研究方向+地区+学校/公司+姓名】 + +
+ +
+ +我们会在 OpenMMLab 社区为大家 + +- 📢 分享 AI 框架的前沿核心技术 +- 💻 解读 PyTorch 常用模块源码 +- 📰 发布 OpenMMLab 的相关新闻 +- 🚀 介绍 OpenMMLab 开发的前沿算法 +- 🏃 获取更高效的问题答疑和意见反馈 +- 🔥 提供与各行各业开发者充分交流的平台 + +干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..cf7c295699c6f05464bfde254cd65070ca0c2291 --- /dev/null +++ b/app.py @@ -0,0 +1,115 @@ +import os +import torch +from operator import itemgetter +from mmaction.apis import init_recognizer, inference_recognizer +import gradio as gr + +# Set paths for Hugging Face Spaces +config_file = 'demo/demo_configs/tsn_r50_1x1x8_video_infer.py' +checkpoint_file = 'checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth' + +# Download model checkpoint if it doesn't exist +def download_checkpoint(): + if not os.path.exists(checkpoint_file): + os.makedirs('checkpoints', exist_ok=True) + print("Model checkpoint not found. Please run 'python download_model.py' to download it.") + print("Or place the checkpoint file manually at:", checkpoint_file) + return False + return True + +# Initialize model +print("Initializing model...") +if not download_checkpoint(): + print("❌ Cannot initialize model without checkpoint. 
Exiting...") + exit(1) + +try: + model = init_recognizer(config_file, checkpoint_file, device='cpu') + print("✅ Model loaded successfully!") +except Exception as e: + print(f"❌ Error loading model: {e}") + print("Please check that the config file and checkpoint are correct.") + exit(1) +# test a single video and show the result: +# video = 'demo.mp4' +# label = '../tools/data/kinetics/label_map_k400.txt' +# results = inference_recognizer(model, video) + +# pred_scores = results.pred_score.tolist() +# score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) +# score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) +# top5_label = score_sorted[:5] + +# labels = open(label).readlines() +# labels = [x.strip() for x in labels] +# results = [(labels[k[0]], k[1]) for k in top5_label] + + +# # show the results +# for result in results: +# print(f'{result[0]}: ', result[1]) + + +def analyze_video(video): + """Analyze video for action recognition""" + try: + if video is None: + return "Please upload a video file." + + print(f"Processing video: {video}") + results = inference_recognizer(model, video) + + # Format results nicely + if hasattr(results, 'pred_score'): + pred_scores = results.pred_score.tolist() + score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) + score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) + top5_label = score_sorted[:5] + + # Load labels if available + label_file = 'tools/data/kinetics/label_map_k400.txt' + if os.path.exists(label_file): + with open(label_file, 'r') as f: + labels = [x.strip() for x in f.readlines()] + results_formatted = [(labels[k[0]], f"{k[1]:.4f}") for k in top5_label] + else: + results_formatted = [(f"Class {k[0]}", f"{k[1]:.4f}") for k in top5_label] + + result_text = "Top 5 Predictions:\n" + for i, (label, score) in enumerate(results_formatted, 1): + result_text += f"{i}. {label}: {score}\n" + + return result_text + else: + return f"Analysis complete. 
Raw result: {results}" + + except Exception as e: + return f"Error processing video: {str(e)}" + +# Create Gradio interface +demo = gr.Interface( + fn=analyze_video, + inputs=gr.Video(label="Upload Video", height=300), + outputs=gr.Textbox(label="Analysis Results", lines=10), + title="🎬 GenVidBench - Video Action Recognition", + description=""" + Upload a video to analyze its content using state-of-the-art action recognition models. + This demo uses TSN (Temporal Segment Networks) trained on Kinetics-400 dataset. + + **Supported formats:** MP4, AVI, MOV, etc. + **Max duration:** Recommended under 30 seconds for faster processing. + """, + examples=[ + ["demo/demo.mp4"] if os.path.exists("demo/demo.mp4") else None + ], + cache_examples=False, + theme=gr.themes.Soft(), + allow_flagging="never" +) + +if __name__ == "__main__": + demo.launch() + + + + diff --git a/checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth b/checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth new file mode 100644 index 0000000000000000000000000000000000000000..7655e0930a9f8d79b633d7892f0c9ae1b3557f49 --- /dev/null +++ b/checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2692d16c712e24994aaa3cfb48f957a521e053ffb81c474e2c0b3e579c888650 +size 97641409 diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..25c9e4c3342f63b162455223cb63ef63eafb2ae8 --- /dev/null +++ b/configs/_base_/default_runtime.py @@ -0,0 +1,24 @@ +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'), + 
sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/configs/_base_/models/audioonly_r50.py b/configs/_base_/models/audioonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..108cce24c978b80a53c819ccfbe68fb6b36f1aac --- /dev/null +++ b/configs/_base_/models/audioonly_r50.py @@ -0,0 +1,16 @@ +# model settings +model = dict( + type='RecognizerAudio', + backbone=dict( + type='ResNetAudio', + depth=50, + pretrained=None, + in_channels=1, + norm_eval=False), + cls_head=dict( + type='TSNAudioHead', + num_classes=400, + in_channels=1024, + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob')) diff --git a/configs/_base_/models/bmn_400x100.py b/configs/_base_/models/bmn_400x100.py new file mode 100644 index 0000000000000000000000000000000000000000..22178c00124aa9587b826444f3630cf16b0442f7 --- /dev/null +++ b/configs/_base_/models/bmn_400x100.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='BMN', + temporal_dim=100, + boundary_ratio=0.5, + num_samples=32, + num_samples_per_bin=3, + feat_dim=400, + soft_nms_alpha=0.4, + soft_nms_low_threshold=0.5, + soft_nms_high_threshold=0.9, + post_process_top_k=100) diff --git a/configs/_base_/models/bsn_pem.py b/configs/_base_/models/bsn_pem.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5910c46fe935a305baea9d5fc24ac0c27800c8 --- /dev/null +++ b/configs/_base_/models/bsn_pem.py @@ -0,0 +1,13 @@ +# model settings +model = dict( + type='PEM', + pem_feat_dim=32, + pem_hidden_dim=256, + pem_u_ratio_m=1, + pem_u_ratio_l=2, + 
pem_high_temporal_iou_threshold=0.6, + pem_low_temporal_iou_threshold=0.2, + soft_nms_alpha=0.75, + soft_nms_low_threshold=0.65, + soft_nms_high_threshold=0.9, + post_process_top_k=100) diff --git a/configs/_base_/models/bsn_tem.py b/configs/_base_/models/bsn_tem.py new file mode 100644 index 0000000000000000000000000000000000000000..07433c95fc8dc8e7255933369f7b58ff3659c200 --- /dev/null +++ b/configs/_base_/models/bsn_tem.py @@ -0,0 +1,8 @@ +# model settings +model = dict( + type='TEM', + temporal_dim=100, + boundary_ratio=0.1, + tem_feat_dim=400, + tem_hidden_dim=512, + tem_match_threshold=0.5) diff --git a/configs/_base_/models/c2d_r50.py b/configs/_base_/models/c2d_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..48943d3696913f8fecbb25055e094755d5ecd34b --- /dev/null +++ b/configs/_base_/models/c2d_r50.py @@ -0,0 +1,20 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='C2D', + depth=50, + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + norm_eval=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/c3d_sports1m_pretrained.py b/configs/_base_/models/c3d_sports1m_pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..396e0910a7404be2bd5e6470b8d0244a4f4bba07 --- /dev/null +++ b/configs/_base_/models/c3d_sports1m_pretrained.py @@ -0,0 +1,28 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='C3D', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth', # noqa: E501 + style='pytorch', + conv_cfg=dict(type='Conv3d'), + norm_cfg=None, + act_cfg=dict(type='ReLU'), + 
dropout_ratio=0.5, + init_std=0.005), + cls_head=dict( + type='I3DHead', + num_classes=101, + in_channels=4096, + spatial_type=None, + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[104, 117, 128], + std=[1, 1, 1], + format_shape='NCTHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/i3d_r50.py b/configs/_base_/models/i3d_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..6308077eb36edf3db194c7cabc5566ec931798fd --- /dev/null +++ b/configs/_base_/models/i3d_r50.py @@ -0,0 +1,30 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3d', + pretrained2d=True, + pretrained='torchvision://resnet50', + depth=50, + conv1_kernel=(5, 7, 7), + conv1_stride_t=2, + pool1_stride_t=2, + conv_cfg=dict(type='Conv3d'), + norm_eval=False, + inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)), + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +# This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332 # noqa: E501 diff --git a/configs/_base_/models/ircsn_r152.py b/configs/_base_/models/ircsn_r152.py new file mode 100644 index 0000000000000000000000000000000000000000..cbdcd615e70539a10caeeddbbba72b6d8124f416 --- /dev/null +++ b/configs/_base_/models/ircsn_r152.py @@ -0,0 +1,28 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dCSN', + pretrained2d=False, + pretrained=None, + depth=152, + with_pool2=False, + bottleneck_mode='ir', + norm_eval=False, + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + 
num_classes=400, + in_channels=2048, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + test_cfg=dict(max_testing_views=10)) diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py new file mode 100644 index 0000000000000000000000000000000000000000..44fb80d6565f6481101d057265c0f8048a31c1fd --- /dev/null +++ b/configs/_base_/models/mvit_small.py @@ -0,0 +1,14 @@ +model = dict( + type='Recognizer3D', + backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + cls_head=dict( + type='MViTHead', + in_channels=768, + num_classes=400, + label_smooth_eps=0.1, + average_clips='prob')) diff --git a/configs/_base_/models/r2plus1d_r34.py b/configs/_base_/models/r2plus1d_r34.py new file mode 100644 index 0000000000000000000000000000000000000000..7650de7d9144e9bc455dc8569bbe3f02294a9be0 --- /dev/null +++ b/configs/_base_/models/r2plus1d_r34.py @@ -0,0 +1,31 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet2Plus1d', + depth=34, + pretrained=None, + pretrained2d=False, + norm_eval=False, + conv_cfg=dict(type='Conv2plus1d'), + norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3), + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(1, 1, 1, 1), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 2, 2, 2), + zero_init_residual=False), + cls_head=dict( + type='I3DHead', + num_classes=400, + in_channels=512, + spatial_type='avg', + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + 
std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/slowfast_r50.py b/configs/_base_/models/slowfast_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..d4510d2ff93c1840c25645cc09adb22b745a8040 --- /dev/null +++ b/configs/_base_/models/slowfast_r50.py @@ -0,0 +1,42 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowFast', + pretrained=None, + resample_rate=8, # tau + speed_ratio=8, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type='SlowFastHead', + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/slowonly_r50.py b/configs/_base_/models/slowonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..b9ab55a92e52ff8305528c648cf1f8015ac10276 --- /dev/null +++ b/configs/_base_/models/slowonly_r50.py @@ -0,0 +1,24 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + lateral=False, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + 
type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) diff --git a/configs/_base_/models/swin_tiny.py b/configs/_base_/models/swin_tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..6186e7c79e19894fa4268365008e8985f2db0714 --- /dev/null +++ b/configs/_base_/models/swin_tiny.py @@ -0,0 +1,28 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='SwinTransformer3D', + arch='tiny', + pretrained=None, + pretrained2d=True, + patch_size=(2, 4, 4), + window_size=(8, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + patch_norm=True), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + cls_head=dict( + type='I3DHead', + in_channels=768, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + average_clips='prob')) diff --git a/configs/_base_/models/tanet_r50.py b/configs/_base_/models/tanet_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..a82926fa2bd9f7d31063ccde0eaff8b61ac62acc --- /dev/null +++ b/configs/_base_/models/tanet_r50.py @@ -0,0 +1,23 @@ +# model settings +model = dict( + type='Recognizer2D', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.5], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + backbone=dict( + type='TANet', + pretrained='torchvision://resnet50', + depth=50, + num_segments=8, + tam_cfg=None), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + average_clips='prob')) diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..6de4d71ff6e5b8f9083250f2b56a157e81fa6dad --- /dev/null +++ 
b/configs/_base_/models/tin_r50.py @@ -0,0 +1,29 @@ +# model settings + +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW') + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTIN', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=4), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=False, + average_clips='prob'), + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + # model training and testing settings + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tpn_slowonly_r50.py b/configs/_base_/models/tpn_slowonly_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba75c93ad4f128f6a51553adfd0a558405e289c --- /dev/null +++ b/configs/_base_/models/tpn_slowonly_r50.py @@ -0,0 +1,45 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + pretrained='torchvision://resnet50', + lateral=False, + out_indices=(2, 3), + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + neck=dict( + type='TPN', + in_channels=(1024, 2048), + out_channels=1024, + spatial_modulation_cfg=dict( + in_channels=(1024, 2048), out_channels=2048), + temporal_modulation_cfg=dict(downsample_scales=(8, 8)), + upsample_cfg=dict(scale_factor=(1, 1, 1)), + downsample_cfg=dict(downsample_scale=(1, 1, 1)), + level_fusion_cfg=dict( + in_channels=(1024, 1024), + mid_channels=(1024, 1024), + out_channels=2048, + downsample_scales=((1, 1, 1), (1, 1, 1))), + aux_head_cfg=dict(out_channels=400, loss_weight=0.5)), + cls_head=dict( + type='TPNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.01, + 
average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + test_cfg=dict(fcn_test=True)) diff --git a/configs/_base_/models/tpn_tsm_r50.py b/configs/_base_/models/tpn_tsm_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..9074c9cc394d149433959ee60f34db19ea81f9ad --- /dev/null +++ b/configs/_base_/models/tpn_tsm_r50.py @@ -0,0 +1,40 @@ +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + out_indices=(2, 3), + norm_eval=False, + shift_div=8), + neck=dict( + type='TPN', + in_channels=(1024, 2048), + out_channels=1024, + spatial_modulation_cfg=dict( + in_channels=(1024, 2048), out_channels=2048), + temporal_modulation_cfg=dict(downsample_scales=(8, 8)), + upsample_cfg=dict(scale_factor=(1, 1, 1)), + downsample_cfg=dict(downsample_scale=(1, 1, 1)), + level_fusion_cfg=dict( + in_channels=(1024, 1024), + mid_channels=(1024, 1024), + out_channels=2048, + downsample_scales=((1, 1, 1), (1, 1, 1))), + aux_head_cfg=dict(out_channels=174, loss_weight=0.5)), + cls_head=dict( + type='TPNHead', + num_classes=174, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=dict(fcn_test=True)) diff --git a/configs/_base_/models/trn_r50.py b/configs/_base_/models/trn_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..88caf404767c06312797da45082229b370187650 --- /dev/null +++ b/configs/_base_/models/trn_r50.py @@ -0,0 +1,25 @@ +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + 
pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + partial_bn=True), + cls_head=dict( + type='TRNHead', + num_classes=400, + in_channels=2048, + num_segments=8, + spatial_type='avg', + relation_type='TRNMultiScale', + hidden_dim=256, + dropout_ratio=0.8, + init_std=0.001, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW')) diff --git a/configs/_base_/models/tsm_mobilenet_v2.py b/configs/_base_/models/tsm_mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6b8fa9239ea2f67cea8dfd2eb6c16aca0964230c --- /dev/null +++ b/configs/_base_/models/tsm_mobilenet_v2.py @@ -0,0 +1,27 @@ +# model settings +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +model = dict( + type='Recognizer2D', + backbone=dict( + type='MobileNetV2TSM', + shift_div=8, + num_segments=8, + is_shift=True, + pretrained='mmcls://mobilenet_v2'), + cls_head=dict( + type='TSMHead', + num_segments=8, + num_classes=400, + in_channels=1280, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + # model training and testing settings + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsm_mobileone_s4.py b/configs/_base_/models/tsm_mobileone_s4.py new file mode 100644 index 0000000000000000000000000000000000000000..27b5a410e3d35e5a61ec35107f630ebc98d26a07 --- /dev/null +++ b/configs/_base_/models/tsm_mobileone_s4.py @@ -0,0 +1,31 @@ +# model settings +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +checkpoint = ('https://download.openmmlab.com/mmclassification/' + 'v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth') +model = dict( + type='Recognizer2D', + backbone=dict( 
+ type='MobileOneTSM', + arch='s4', + shift_div=8, + num_segments=8, + is_shift=True, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, prefix='backbone')), + cls_head=dict( + type='TSMHead', + num_segments=8, + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + # model training and testing settings + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsm_r50.py b/configs/_base_/models/tsm_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..23b1eda5f10bbf0d24623d9b3eeb72da87d07654 --- /dev/null +++ b/configs/_base_/models/tsm_r50.py @@ -0,0 +1,24 @@ +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) + +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNetTSM', + pretrained='torchvision://resnet50', + depth=50, + norm_eval=False, + shift_div=8), + cls_head=dict( + type='TSMHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.5, + init_std=0.001, + is_shift=True, + average_clips='prob'), + data_preprocessor=dict(type='ActionDataPreprocessor', **preprocess_cfg), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsn_mobileone_s0.py b/configs/_base_/models/tsn_mobileone_s0.py new file mode 100644 index 0000000000000000000000000000000000000000..5a67c9617cc3eeee3ad61e061284251453ba29fc --- /dev/null +++ b/configs/_base_/models/tsn_mobileone_s0.py @@ -0,0 +1,26 @@ +checkpoint = ('https://download.openmmlab.com/mmclassification/' + 'v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth') +model = dict( + type='Recognizer2D', + backbone=dict( + type='mmpretrain.MobileOne', + arch='s0', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint, 
prefix='backbone'), + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=1024, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/tsn_r50.py b/configs/_base_/models/tsn_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..c277f9ae628cc6d833e8c6f63ae34eafa4bf2595 --- /dev/null +++ b/configs/_base_/models/tsn_r50.py @@ -0,0 +1,23 @@ +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth', + depth=50, + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=400, + in_channels=2048, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.01, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/models/x3d.py b/configs/_base_/models/x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a8a3b661d864f02152f73711fa3c26b9e9ab06 --- /dev/null +++ b/configs/_base_/models/x3d.py @@ -0,0 +1,20 @@ +# model settings +model = dict( + type='Recognizer3D', + backbone=dict(type='X3D', gamma_w=1, gamma_b=2.25, gamma_d=2.2), + cls_head=dict( + type='X3DHead', + in_channels=432, + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5, + fc1_bias=False, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.38, 57.38, 57.38], + format_shape='NCTHW'), + # model training and testing settings + train_cfg=None, + 
test_cfg=None) diff --git a/configs/_base_/schedules/adam_20e.py b/configs/_base_/schedules/adam_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..45e5552591c4bad744e2ce2188b30372a24e7faa --- /dev/null +++ b/configs/_base_/schedules/adam_20e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10], + gamma=0.1) +] + +optimizer = dict( + type='Adam', lr=0.01, weight_decay=0.00001) # this lr is used for 1 gpus + +optim_wrapper = dict( + optimizer=optimizer, clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_100e.py b/configs/_base_/schedules/sgd_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..43ae5ef12a30ebfe9bdcfc6898227213055fdda3 --- /dev/null +++ b/configs/_base_/schedules/sgd_100e.py @@ -0,0 +1,18 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_150e_warmup.py b/configs/_base_/schedules/sgd_150e_warmup.py new file mode 100644 index 0000000000000000000000000000000000000000..3360c6a2c68c99d642fa8cd22b98d79775b517c1 --- /dev/null +++ b/configs/_base_/schedules/sgd_150e_warmup.py @@ -0,0 +1,19 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, 
begin=0, end=10), + dict( + type='MultiStepLR', + begin=0, + end=150, + by_epoch=True, + milestones=[90, 130], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_50e.py b/configs/_base_/schedules/sgd_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..a6e8d185bde5ba7f019491fd8d700f44097a0b57 --- /dev/null +++ b/configs/_base_/schedules/sgd_50e.py @@ -0,0 +1,18 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_100e.py b/configs/_base_/schedules/sgd_tsm_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..52972c7655f392fb4cf2f69552b08df93b55629c --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_100e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_50e.py b/configs/_base_/schedules/sgd_tsm_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9f1b561c4c6bb5f261c7912477fe671e3bbf65 --- /dev/null +++ 
b/configs/_base_/schedules/sgd_tsm_50e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 40], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py new file mode 100644 index 0000000000000000000000000000000000000000..76d4387393ddc86bcd16de88dc47a4f41f24e720 --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_100e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..a110189555458996f402664ee0aa2e065ac489ab --- /dev/null +++ b/configs/_base_/schedules/sgd_tsm_mobilenet_v2_50e.py @@ -0,0 +1,20 @@ +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[20, 
40], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_genvidbench.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_genvidbench.py new file mode 100644 index 0000000000000000000000000000000000000000..426a5098f6de60c6d148e441bd21a4dc0bed49d4 --- /dev/null +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_genvidbench.py @@ -0,0 +1,105 @@ +_base_ = [ + '../../_base_/models/i3d_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + +model = dict(cls_head=dict(num_classes=2)) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] 
+test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
auto_scale_lr = dict(enable=False, base_batch_size=64)

# --------------------------------------------------------------------------
# configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_genvidbench.py
# MViT-small on GenVidBench: 2-class (real vs. generated) video recognition.
# --------------------------------------------------------------------------
_base_ = [
    '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py'
]

# FIX(review): the original file assigned `model` twice; the later
# `model = dict(cls_head=...)` replaced this dict wholesale, silently
# discarding the data_preprocessor/blending overrides. In addition, the
# blending transforms were still configured for 400 Kinetics classes.
# Merged into one dict with `num_classes` aligned to the 2-class head.
model = dict(
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        blending=dict(
            type='RandomBatchAugment',
            augments=[
                dict(type='MixupBlending', alpha=0.8, num_classes=2),
                dict(type='CutmixBlending', alpha=1, num_classes=2)
            ]),
        format_shape='NCTHW'),
    cls_head=dict(num_classes=2))

# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data'
data_root_val = 'data'
ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt'
ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt'
ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt'

file_client_args = dict(io_backend='disk')

train_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='PytorchVideoWrapper',
        op='RandAugment',
        magnitude=7,
        num_layers=4),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='RandomErasing', erase_prob=0.25, mode='rand'),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=16,
        frame_interval=4,
        num_clips=1,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=16,
        frame_interval=4,
        num_clips=5,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

# Repeated augmentation: each training sample is used twice per batch.
repeat_sample = 2
train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type='repeat_pseudo_collate'),
    dataset=dict(
        type='RepeatAugDataset',
        num_repeats=repeat_sample,
        sample_once=True,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=200, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

base_lr = 1.6e-3
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
    clip_grad=dict(max_norm=1, norm_type=2))
norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512 // repeat_sample) diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_genvidbench.py b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_genvidbench.py new file mode 100644 index 0000000000000000000000000000000000000000..03a43df355cd034813eb0d3676bf12e7772cf7ae --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_genvidbench.py @@ -0,0 +1,118 @@ +_base_ = [ + '../../_base_/models/slowfast_r50.py', '../../_base_/default_runtime.py' +] + +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + +model = dict(cls_head=dict(num_classes=2)) + + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( 
+ type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=256, val_begin=1, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=34, + 
# (Tail of the slowfast config's LR schedule; the statement head is
# reconstructed from the adjacent visible text.)
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.1,
        by_epoch=True,
        begin=0,
        end=34,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=256,
        eta_min=0,
        by_epoch=True,
        begin=0,
        end=256)
]

default_hooks = dict(
    checkpoint=dict(interval=4, max_keep_ckpts=3), logger=dict(interval=100))

# --------------------------------------------------------------------------
# configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-genvidbench.py
# Video Swin-Tiny on GenVidBench: 2-class real/fake video recognition.
# --------------------------------------------------------------------------
_base_ = [
    '../../_base_/models/swin_tiny.py', '../../_base_/default_runtime.py'
]

# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data'
data_root_val = 'data'
ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt'
ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt'
ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt'

# FIX(review): the original file assigned `model` twice; the second
# `model = dict(cls_head=...)` replaced the first one and silently dropped
# the pretrained-backbone override. Merged into a single dict so both
# overrides survive.
model = dict(
    backbone=dict(
        pretrained=  # noqa: E251
        'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_tiny_patch4_window7_224.pth'  # noqa: E501
    ),
    cls_head=dict(num_classes=2))

file_client_args = dict(io_backend='disk')
train_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=4,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='ThreeCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Mixed-precision training; Swin-specific constructor applies the
# per-parameter decay/lr multipliers below.
optim_wrapper = dict(
    type='AmpOptimWrapper',
    optimizer=dict(
        type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02),
    constructor='SwinOptimWrapperConstructor',
    paramwise_cfg=dict(
        absolute_pos_embed=dict(decay_mult=0.),
        relative_position_bias_table=dict(decay_mult=0.),
        norm=dict(decay_mult=0.),
        backbone=dict(lr_mult=0.1)))

# (Scheduler tail reconstructed from the adjacent visible text.)
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.1,
        by_epoch=True,
        begin=0,
        end=2.5,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=30,
        eta_min=0,
        by_epoch=True,
        begin=0,
        end=30)
]
convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/timesformer/timesformer_genvidbench.py b/configs/recognition/timesformer/timesformer_genvidbench.py new file mode 100644 index 0000000000000000000000000000000000000000..8dc92d15faa1a9dcb1489950116784429a59483e --- /dev/null +++ b/configs/recognition/timesformer/timesformer_genvidbench.py @@ -0,0 +1,147 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='TimeSformer', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 + num_frames=8, + img_size=224, + patch_size=16, + embed_dims=768, + in_channels=3, + dropout_ratio=0., + transformer_layers=None, + attention_type='space_only', + norm_cfg=dict(type='LN', eps=1e-6)), + cls_head=dict( + type='TimeSformerHead', + num_classes=2, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, 
frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=224), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=32, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=32, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=15, val_begin=1, 
# (Tail of the timesformer config; the statement head is reconstructed
# from the adjacent visible text.)
train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=15, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

optim_wrapper = dict(
    optimizer=dict(
        type='SGD', lr=0.005, momentum=0.9, weight_decay=1e-4, nesterov=True),
    paramwise_cfg=dict(
        custom_keys={
            '.backbone.cls_token': dict(decay_mult=0.0),
            '.backbone.pos_embed': dict(decay_mult=0.0),
            '.backbone.time_embed': dict(decay_mult=0.0)
        }),
    clip_grad=dict(max_norm=40, norm_type=2))

param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=15,
        by_epoch=True,
        milestones=[5, 10],
        gamma=0.1)
]

default_hooks = dict(checkpoint=dict(interval=5))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=64)

# --------------------------------------------------------------------------
# configs/recognition/tin/tin_genvidbench.py
# TIN-R50 on GenVidBench: 2-class real/fake video recognition.
# --------------------------------------------------------------------------
_base_ = [
    '../../_base_/models/tin_r50.py', '../../_base_/schedules/sgd_50e.py',
    '../../_base_/default_runtime.py'
]

# model settings
# FIX(review): the original file assigned `model` twice; the later
# `model = dict(cls_head=dict(num_classes=2))` replaced this dict and
# silently dropped `is_shift=True`. Merged into a single assignment.
model = dict(cls_head=dict(is_shift=True, num_classes=2))

# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data'
data_root_val = 'data'
ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt'
ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt'
ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt'

train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]

train_dataloader = dict(
    batch_size=6,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=6,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(val_interval=5)

# optimizer
optim_wrapper = dict(
    constructor='TSMOptimWrapperConstructor', paramwise_cfg=dict(fc_lr5=True))
# NOTE(review): initializes TIN from a TSM Kinetics-400 checkpoint — TIN is
# commonly warm-started this way, but confirm this is intended.
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth'  # noqa: E501
# (Continuation of the truncated `load_from` URL from the tin config above.)
'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth'  # noqa: E501

# --------------------------------------------------------------------------
# configs/recognition/tpn/tpn_genvidbench.py
# TPN (TSM-R50 backbone) on GenVidBench: 2-class real/fake recognition.
# --------------------------------------------------------------------------
_base_ = [
    '../../_base_/models/tpn_tsm_r50.py', '../../_base_/default_runtime.py'
]

# FIX(review): the pipelines below decode raw frames (`RawFrameDecode`) and
# the dataloaders use `data_prefix=dict(img=...)` plus `filename_tmpl`,
# which is the rawframe annotation format; `VideoDataset` does not provide
# the frame-directory/total-frames fields those transforms consume.
# Switched the dataset type to match the pipelines.
dataset_type = 'RawframeDataset'
data_root = 'data'
data_root_val = 'data'
ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt'
ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt'
ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt'

model = dict(cls_head=dict(num_classes=2))

# NOTE(review): inherited from a Something-Something V1 config; with only
# two classes none of these label ids exist, so the map is a no-op —
# confirm whether flip label mapping should be dropped entirely.
sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
train_pipeline = [
    dict(type='SampleFrames', clip_len=8, frame_interval=1, num_clips=1),
    dict(type='RawFrameDecode'),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map),
    dict(type='ColorJitter'),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        twice_sample=True,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='PackActionInputs')
]

train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=dict(img=data_root),
        filename_tmpl='{:05}.jpg',
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(img=data_root_val),
        filename_tmpl='{:05}.jpg',
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(img=data_root_val),
        filename_tmpl='{:05}.jpg',
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=150, val_begin=1, val_interval=5)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=150,
        by_epoch=True,
        milestones=[75, 125],
        gamma=0.1)
]

optim_wrapper = dict(
    optimizer=dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True),
    clip_grad=dict(max_norm=20, norm_type=2))

# --------------------------------------------------------------------------
# configs/recognition/trn/trn_genvidbench.py (continues past this span)
# --------------------------------------------------------------------------
_base_ = ['../../_base_/models/trn_r50.py', '../../_base_/default_runtime.py']

# model settings
model = dict(cls_head=dict(num_classes=2))

# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data'
data_root_val = 'data'
ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt'
= 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + +file_client_args = dict(io_backend='disk') + +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=1, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + twice_sample=True, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=False), + optimizer=dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=5e-4), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[30, 45], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +find_unused_parameters = True diff --git a/configs/recognition/tsm/tsm_genvidench.py b/configs/recognition/tsm/tsm_genvidench.py new file mode 100644 index 0000000000000000000000000000000000000000..cf62b840e33dcdaccdf84bd1ed251afcb6c98647 --- /dev/null +++ b/configs/recognition/tsm/tsm_genvidench.py @@ -0,0 +1,128 @@ +_base_ = [ + '../../_base_/models/tsm_mobilenet_v2.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + +model = dict(cls_head=dict(num_classes=2)) + + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', 
**file_client_args), + dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=100, + by_epoch=True, + milestones=[40, 80], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00002), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=128) diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a596c703f0bfd1a5378a891cd087b62c2de4e27a --- /dev/null +++ b/configs/recognition/uniformerv2/README.md @@ -0,0 +1,110 @@ +# UniFormerV2 + +[UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer](https://arxiv.org/abs/2211.09552) + + + +## Abstract + + + +Learning discriminative spatiotemporal representation is the key problem of video understanding. Recently, Vision Transformers (ViTs) have shown their power in learning long-term video dependency with self-attention. Unfortunately, they exhibit limitations in tackling local video redundancy, due to the blind global comparison among tokens. UniFormer has successfully alleviated this issue, by unifying convolution and self-attention as a relation aggregator in the transformer format. 
However, this model has to require a tiresome and complicated image-pretraining phrase, before being finetuned on videos. This blocks its wide usage in practice. On the contrary, open-sourced ViTs are readily available and well-pretrained with rich image supervision. Based on these observations, we propose a generic paradigm to build a powerful family of video networks, by arming the pretrained ViTs with efficient UniFormer designs. We call this family UniFormerV2, since it inherits the concise style of the UniFormer block. But it contains brand-new local and global relation aggregators, which allow for preferable accuracy-computation balance by seamlessly integrating advantages from both ViTs and UniFormer. Without any bells and whistles, our UniFormerV2 gets the state-of-the-art recognition performance on 8 popular video benchmarks, including scene-related Kinetics-400/600/700 and Moments in Time, temporal-related Something-Something V1/V2, untrimmed ActivityNet and HACS. In particular, it is the first model to achieve 90% top-1 accuracy on Kinetics-400, to our best knowledge. + + + +
+ +
+ +## Results and Models + +### Kinetics-400 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | 
[config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | + +### Kinetics-600 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | 
[config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | + +### Kinetics-700 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | + +### MiTv1 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | 
:-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | + +### Kinetics-710 + +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :------: | :------: | :------: | :------------------------------------------: | :----------------------------------------: | :----------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 78.9 | 94.2 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | - | - | 
[config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth) | - | + +The models with * are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only support reliable training config for base model (i.e. UniFormerV2-B/16). + +1. The values in columns named after "reference" are the results of the original repo. +2. The values in `top1/5 acc` is tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL). +3. The values in columns named after "mm-Kinetics" are the testing results on the Kinetics dataset held by MMAction2, which is also used by other models in MMAction2. Due to the differences between various versions of Kinetics dataset, there is a little gap between `top1/5 acc` and `mm-Kinetics top1/5 acc`. For a fair comparison with other models, we report both results here. Note that we simply report the inference results, since the training set is different between UniFormer and other models, the results are lower than that tested on the author's version. +4. Since the original models for Kinetics-400/600/700 adopt different [label file](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL), we simply map the weight according to the label name. New label map for Kinetics-400/600/700 can be found [here](/tools/data/kinetics). +5. Due to some differences between [SlowFast](https://github.com/facebookresearch/SlowFast) and MMAction2, there are some gaps between their performances. +6. Kinetics-710 is used for pretraining, which helps improve the performance on other datasets efficiently. 
You can find more details in the [paper](https://arxiv.org/abs/2211.09552). We also map the weight for Kinetics-710 checkpoints, you can find the label map [here](/tools/data/kinetics710/label_map_k710.txt). + +For more details on data preparation, you can refer to + +- [preparing_kinetics](/tools/data/kinetics/README.md) +- [preparing_mit](/tools/data/mit/README.md) + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test UniFormerV2-B/16 model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Citation + +```BibTeX +@article{Li2022UniFormerV2SL, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Y. Qiao}, + journal={ArXiv}, + year={2022}, + volume={abs/2211.09552} +} +``` diff --git a/configs/recognition/uniformerv2/README_zh-CN.md b/configs/recognition/uniformerv2/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ee1f9e84c512a9feecd096dd83c510ef5d002306 --- /dev/null +++ b/configs/recognition/uniformerv2/README_zh-CN.md @@ -0,0 +1,98 @@ +# UniFormerV2 + +[UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer](https://arxiv.org/abs/2211.09552) + + + +## 简介 + +```BibTeX +@article{Li2022UniFormerV2SL, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Y. 
Qiao}, + journal={ArXiv}, + year={2022}, + volume={abs/2211.09552} +} +``` + +## 模型库 + +### Kinetics-400 + +| 均匀采样帧数 | 分辨率 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 准确率 | mm-Kinetics top1 准确率 | mm-Kinetics top5 准确率 | 测试方案 | FLOPs | 参数量 | 配置文件 | ckpt | log | +| :----------: | :------------: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | 
[config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | + +### Kinetics-600 + +| 均匀采样帧数 | 分辨率 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 准确率 | mm-Kinetics top1 准确率 | mm-Kinetics top5 准确率 | 测试方案 | FLOPs | 参数量 | 配置文件 | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | 
[config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | + +### Kinetics-700 + +| 均匀采样帧数 | 分辨率 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 准确率 | mm-Kinetics top1 准确率 | mm-Kinetics top5 准确率 | 测试方案 | FLOPs | 参数量 | 配置文件 | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------: | :---------------------: | :---------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | + +### MiTv1 + +| 均匀采样帧数 | 分辨率 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 准确率 | [参考文献](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 准确率 | 测试方案 | FLOPs | 参数量 | config | ckpt | log | +| :----------: | :----: | :--------------------: | :--------------------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------: | 
:-------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | 
UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | + +### Kinetics-710 + +| 均匀采样帧数 | 分辨率 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | config | ckpt | log | +| :----------: | :----: | :--------------------: | :----: | :---------: | :---------: | :-------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 78.9 | 94.2 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth) | - | + +以上带有 * 
的模型是迁移自[UniFormerV2仓库](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md),并在我们的数据上进行了测试。由于算力限制,我们仅保证基础模型(即 UniFormerV2-B/16)训练配置的可靠性。 + +1. "参考文献"列中的数值是原始仓库的结果。 +2. `top1/5准确率` 中的数值是在与原始仓库相同的数据上进行测试得到的,并且分类器-标签映射与 [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL)一致。 +3. "mm-Kinetics" 列中的数值是在 MMAction2 持有的 Kinetics 数据集上进行的测试结果,其他 MMAction2 模型也使用了该数据集。由于各个版本的 Kinetics 数据集之间存在差异,`top1/5准确率` 和 `mm-Kinetics top1/5准确率` 之间存在一些差异。为了与其他模型进行公平比较,我们在这里报告了两个结果。请注意,我们只报告推断结果,因为 UniFormer 和其他模型的训练集不同,所以该结果低于在作者版本上测试的结果。 +4. 由于 Kinetics-400/600/700 的原始模型采用了不同的[标签文件](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL),我们根据标签名称简单映射权重。Kinetics-400/600/700的新标签映射可以在[这里](/tools/data/kinetics)找到。 +5. 由于 [SlowFast](https://github.com/facebookresearch/SlowFast)和 MMAction2 之间存在一些差异,它们的性能之间存在一些差距。 +6. 我们使用Kinetics-710进行预训练,这有助于提高其他数据集的性能。你可以在[论文](https://arxiv.org/abs/2211.09552)中找到更多细节。我们还根据 Kinetics-710 的模型权重进行了权重映射,你可以在[这里](/tools/data/kinetics710/label_map_k710.txt)找到标签映射。 + +有关数据准备的更多详细信息,可以参考以下链接: + +- [准备 Kinetics 数据集](/tools/data/kinetics/README_zh-CN.md) +- [准备 MIT 数据集](/tools/data/mit/README_zh-CN.md) + +## 如何测试 + +您可以使用以下命令来测试模型: + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +示例:在 Kinetics-400 数据集上测试 UniFormerV2-B/16 模型,并将结果转储到一个pkl文件中。 + +```shell +python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +有关更多详细信息,请参考[训练和测试教程](/docs/zh_cn/user_guides/train_test.md)中的**测试**部分。 diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json new file mode 100644 index 0000000000000000000000000000000000000000..055a6c7f3a972e1568d3c26d6a720b69afaa4871 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json @@ -0,0 +1 @@ +[0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json 
b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json new file mode 100644 index 0000000000000000000000000000000000000000..618ee5ba988dd2f4fd3c4e03029fad7195c993c0 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json @@ -0,0 +1 @@ +[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 639, 8, 584, 9, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 18, 19, 20, 21, 22, 23, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 591, 30, 31, 32, 33, 34, 660, 644, 35, 36, 37, 38, 522, 629, 39, 709, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 45, 696, 46, 702, 47, 48, 49, 50, 51, 682, 52, 53, 54, 55, 505, 529, 514, 652, 708, 56, 548, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 656, 521, 69, 563, 71, 72, 73, 569, 688, 74, 75, 597, 512, 76, 77, 576, 78, 79, 636, 585, 80, 641, 81, 496, 82, 83, 84, 85, 86, 87, 88, 89, 649, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 677, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 127, 564, 128, 554, 129, 524, 130, 131, 132, 133, 134, 135, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 145, 146, 147, 148, 620, 640, 149, 150, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 495, 650, 501, 552, 543, 519, 555, 182, 672, 560, 581, 183, 184, 185, 609, 499, 561, 568, 187, 573, 188, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 197, 675, 198, 654, 199, 638, 200, 201, 648, 539, 202, 203, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 211, 136, 212, 213, 655, 666, 214, 593, 513, 580, 687, 215, 216, 217, 218, 219, 220, 221, 678, 695, 223, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 697, 676, 238, 542, 572, 239, 240, 615, 241, 523, 665, 242, 
671, 243, 515, 244, 574, 245, 246, 247, 248, 249, 250, 251, 533, 252, 562, 253, 492, 614, 498, 608, 254, 255, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 528, 607, 281, 282, 283, 284, 551, 557, 285, 553, 685, 286, 536, 287, 537, 288, 289, 625, 290, 291, 292, 293, 294, 691, 295, 296, 297, 598, 298, 299, 602, 301, 642, 302, 303, 304, 100, 305, 306, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 326, 327, 329, 570, 330, 623, 601, 534, 331, 332, 333, 334, 703, 336, 337, 338, 339, 340, 341, 587, 342, 669, 344, 345, 518, 690, 610, 346, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 363, 364, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 374, 375, 520, 575, 606, 626, 377, 668, 378, 546, 379, 380, 381, 382, 583, 383, 384, 385, 527, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e18b787ef8f66a3d1d434a08b2fbbaf297c659 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json @@ -0,0 +1 @@ +[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 447, 639, 8, 584, 9, 10, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 453, 477, 18, 19, 20, 21, 22, 23, 439, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 484, 591, 30, 31, 32, 33, 34, 660, 435, 644, 35, 419, 36, 37, 38, 522, 629, 39, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 436, 45, 696, 450, 46, 431, 702, 47, 48, 49, 50, 51, 682, 52, 53, 475, 54, 458, 55, 505, 529, 514, 652, 56, 548, 57, 58, 59, 61, 62, 63, 64, 
65, 66, 67, 68, 403, 656, 521, 69, 563, 70, 71, 72, 73, 569, 688, 406, 74, 75, 597, 512, 76, 77, 470, 576, 78, 79, 636, 585, 418, 80, 641, 451, 81, 496, 82, 83, 84, 85, 86, 87, 88, 415, 89, 479, 649, 90, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 405, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 409, 116, 117, 118, 677, 402, 119, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 456, 126, 127, 564, 128, 554, 445, 129, 524, 130, 131, 132, 133, 134, 135, 137, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 422, 145, 146, 147, 148, 620, 640, 149, 150, 404, 486, 473, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 467, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 182, 672, 560, 466, 581, 183, 184, 185, 609, 499, 561, 568, 187, 481, 573, 188, 442, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 490, 197, 488, 437, 675, 198, 654, 199, 638, 438, 424, 200, 201, 648, 539, 202, 203, 427, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 408, 211, 136, 212, 213, 454, 655, 666, 214, 429, 593, 513, 580, 687, 215, 216, 217, 421, 218, 219, 220, 221, 678, 446, 695, 222, 223, 423, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 485, 697, 676, 238, 542, 401, 483, 572, 239, 240, 615, 241, 471, 523, 665, 242, 671, 243, 430, 465, 515, 244, 574, 474, 491, 245, 246, 247, 248, 249, 250, 251, 533, 252, 400, 562, 253, 413, 492, 614, 498, 440, 462, 608, 254, 463, 255, 420, 476, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 416, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 428, 528, 607, 281, 282, 283, 433, 284, 478, 551, 557, 285, 553, 685, 286, 407, 536, 287, 537, 288, 289, 625, 290, 
291, 292, 293, 294, 691, 295, 452, 296, 297, 461, 598, 298, 411, 299, 300, 602, 301, 642, 302, 443, 303, 412, 304, 100, 305, 306, 482, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 432, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 449, 326, 455, 327, 328, 329, 570, 330, 426, 425, 457, 623, 601, 534, 464, 331, 332, 333, 334, 703, 336, 337, 441, 338, 339, 340, 341, 587, 489, 487, 342, 669, 344, 345, 518, 690, 610, 346, 414, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 362, 363, 364, 459, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 417, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 374, 375, 472, 520, 575, 606, 626, 377, 668, 469, 378, 546, 444, 379, 380, 381, 382, 583, 383, 384, 385, 527, 410, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399] diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..7172768e01903e60c42d400480336a9482e6e3cb --- /dev/null +++ b/configs/recognition/uniformerv2/metafile.yml @@ -0,0 +1,466 @@ +Collections: +- Name: UniFormerV2 + README: configs/recognition/uniformerv2/README.md + Paper: + URL: https://arxiv.org/abs/2211.09552 + Title: "UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer" + +Models: + - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 84.3 + Top 5 Accuracy: 96.4 + 
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 85.8 + Top 5 Accuracy: 97.1 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: 
https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 88.7 + Top 5 Accuracy: 98.1 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.0 + Top 5 Accuracy: 98.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Resolution: 224x224 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.3 + Top 5 Accuracy: 98.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth + + - 
Name: uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Resolution: 336x336 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.5 + Top 5 Accuracy: 98.4 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 86.4 + Top 5 Accuracy: 97.3 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth + + - Name: 
uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.0 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.4 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.2 + Top 5 Accuracy: 98.3 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth + + - Name: uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-600 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 89.8 + Top 5 Accuracy: 98.5 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth + + - Name: uniformerv2-base-p16-res224_clip-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.9 + Top 5 Accuracy: 92.9 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 76.3 + Top 5 Accuracy: 92.9 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: 
https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 80.8 + Top 5 Accuracy: 95.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 16 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.2 + Top 5 Accuracy: 95.6 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth + + - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.4 + Top 5 Accuracy: 95.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth + + - Name: 
uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Frame: 32 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.1 + Top 5 Accuracy: 96.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth + + - Name: uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth + + - Name: uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth + + - Name: uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Training Resources: 16 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 42.3 + Top 5 Accuracy: 71.5 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log + Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth + + - Name: uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 47.0 + Top 5 Accuracy: 76.1 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth + + - Name: uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + In Collection: UniFormerV2 + Metadata: + Architecture: UniFormerV2-L/14@336 + Pretrained: Kinetics-710 + Kinetics-400 + Frame: 8 + Sampling method: Uniform + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Moments in Time V1 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 47.7 + Top 5 Accuracy: 76.8 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth diff --git 
a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7899080d7bfe1cca72685d0d3304762e86082b8a --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -0,0 +1,166 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', 
**file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + 
+val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5b204cbff46e6b3ac3431561d61e26c0cd1965a9 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# 
dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..4d02918194e4945de81392d19f1051ae1c038984 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# 
dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8051b3e55427912da037a2bbba5aa3271ecfa6 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,174 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# 
dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9f77cde4a5a9eae713b8c937a8f1e1e8d26f80 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..646ca7846f532c2b6d1a33c63fa919cfa8a0e9d7 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,164 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + 
dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=2, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 
224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6db1923dcd09ff4d4bea2c4294cb878b00af4d --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + 
dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, 
val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..da3f239e200f326084044633bacd02824b0bc7cc --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -0,0 +1,209 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='TimeSformerHead', + 
dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +# dataset settings +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 
'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type='VideoDataset', + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type='VideoDataset', + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type='VideoDataset', + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type='VideoDataset', + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type='VideoDataset', + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type='VideoDataset', + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type='ConcatDataset', + datasets=[k400_trainset, k600_trainset, k700_trainset]) +k710_valset = dict( + type='ConcatDataset', datasets=[k400_valset, k600_valset, k700_valset]) +k710_testset = dict( + type='ConcatDataset', + datasets=[k400_testset, k600_testset, k700_testset], +) + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_valset) +test_dataloader = dict( 
+ batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=k710_testset) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.5, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_genvidbench.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_genvidbench.py new file mode 100644 index 0000000000000000000000000000000000000000..646ca7846f532c2b6d1a33c63fa919cfa8a0e9d7 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_genvidbench.py @@ -0,0 +1,164 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=2, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data' +data_root_val = 'data' +ann_file_train = 'data/GenVidBench/label/fake_real_label/train.txt' +ann_file_val = 'data/GenVidBench/label/fake_real_label/test.txt' +ann_file_test = 'data/GenVidBench/label/fake_real_label/test.txt' + + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), 
keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', 
lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e90ab34c0986fd6c62c4722e8bc0022da0ebbe88 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], 
+ std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..e5320a286804a56cf5b44aea6c2aa3fba4523b7f --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + 
average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..249f02aa4c921fef0a15b39114f316cbe030812a --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 16 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 
0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b1068b7d108330fe69c9301cc98b730b69e15d44 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 
23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..94ba4fab323a10ec97c3ba6434ac6efe01306f4d --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + 
backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd9758ce12aab2f1c4dc01fc4d03f195fd724ad --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + 
input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f9e7036c553f939d5a612dc9c2c02c12ad0eec --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# 
model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..6a80137fa323b34ca37a1b9a8515a933f3bb5046 --- /dev/null +++ 
b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py new file mode 
100644 index 0000000000000000000000000000000000000000..f7df26fd449061176092cfe445d01caa1ddd106f --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py 
b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0732b92be789155cee29e126b61aba9c3fa882 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..2a3a90ae2e00e9f82d039de852e776dc2e45fe06 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + 
return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..b341643a082f6a3e990759d46ac8d55cdc890ca5 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + 
dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..85a80bab0cc4c459ed13cb1c1c3eb7bde057706f --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + 
type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=2, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..58e234bea7425a7bdb74028a72f32cb5b0641e80 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,37 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings 
+num_frames = 32 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..0c15724589252c7f2ca18cf8ee26a0bbd2b52e35 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 
57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8a6907feeebbca6367ec62cd92e8faeb8168b0 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,70 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type='TimeSformerHead', + dropout_ratio=0.5, + num_classes=339, + 
in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 336)), + dict(type='ThreeCrop', crop_size=336), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') diff --git a/data/Pari1_sampled_dataset.zip b/data/Pari1_sampled_dataset.zip new file mode 100644 index 0000000000000000000000000000000000000000..160bb81fccae815f636477b2ab384b01115b4707 --- /dev/null +++ b/data/Pari1_sampled_dataset.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb37250480810a4f5557cdcf04d5d27be41ca519b1744e4cf944922a3c2463f +size 2452248 diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 0000000000000000000000000000000000000000..de9f03728f0d47eff0f2a47ec55ddbbcda2ae0c4 --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,40 @@ +openxlab: true +kinetics400: + dataset: OpenMMLab/Kinetics-400 + download_root: data + data_root: data/kinetics400 + script: tools/data/kinetics/preprocess_k400.sh + +kinetics600: + dataset: OpenMMLab/Kinetics600 + download_root: data + data_root: data/kinetics600 + script: tools/data/kinetics/preprocess_k600.sh + +kinetics700: + dataset: OpenMMLab/Kinetics_700 + 
download_root: data + data_root: data/kinetics700 + script: tools/data/kinetics/preprocess_k700.sh + +sthv2: + dataset: OpenDataLab/sthv2 + download_root: data + data_root: data/sthv2 + script: tools/data/sthv2/preprocess.sh + +ucf-101: + dataset: OpenDataLab/UCF101 + download_root: data + data_root: data/ucf101 + +finegym: + dataset: OpenDataLab/FineGym + download_root: data + data_root: data/gym + +diving48: + dataset: OpenDataLab/diving48 + download_root: data + data_root: data/diving48 + script: tools/data/diving48/preprocess.sh diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..70b8b065b4950066b2f2cb59a8cb4311888d053c --- /dev/null +++ b/demo/README.md @@ -0,0 +1,743 @@ +# Demo + +## Outline + +- [Modify configs through script arguments](#modify-config-through-script-arguments): Tricks to directly modify configs through script arguments. +- [Video demo](#video-demo): A demo script to predict the recognition result using a single video. +- [Video GradCAM Demo](#video-gradcam-demo): A demo script to visualize GradCAM results using a single video. +- [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera. +- [Long Video demo](#long-video-demo): a demo script to predict different labels using a single long video. +- [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video. +- [SpatioTemporal Action Detection Webcam Demo](#spatiotemporal-action-detection-webcam-demo): A demo script to implement real-time spatio-temporal action detection from a web camera. +- [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the spatiotemporal action detection result using a single video. 
+- [SpatioTemporal Action Detection ONNX Video Demo](#spatiotemporal-action-detection-onnx-video-demo): A demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. +- [Inferencer Demo](#inferencer): A demo script to implement fast predict for video analysis tasks based on unified inferencer interface. +- [Audio Demo](#audio-demo): A demo script to predict the recognition result using a single audio file. +- [Video Structuralize Demo](#video-structuralize-demo): A demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection result using a single video. + +## Modify configs through script arguments + +When running demos using our provided scripts, you may specify `--cfg-options` to in-place modify the config. + +- Update config keys of dict. + + The config options can be specified following the order of the dict keys in the original config. + For example, `--cfg-options model.backbone.norm_eval=False` changes the all BN modules in model backbones to `train` mode. + +- Update keys inside a list of configs. + + Some config dicts are composed as a list in your config. For example, the training pipeline `train_dataloader.dataset.pipeline` is normally a list + e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline, + you may specify `--cfg-options train_dataloader.dataset.pipeline.0.type=DenseSampleFrames`. + +- Update values of list/tuples. + + If the value to be updated is a list or a tuple. For example, the config file normally sets `workflow=[('train', 1)]`. If you want to + change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation mark " is necessary to + support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value. 
+ +## Video demo + +MMAction2 provides a demo script to predict the recognition result using a single video. In order to get predict results in range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in config file. + +```shell +python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \ + [--device ${DEVICE_TYPE}] [--fps ${FPS}] [--font-scale ${FONT_SCALE}] [--font-color ${FONT_COLOR}] \ + [--target-resolution ${TARGET_RESOLUTION}] [--out-filename ${OUT_FILE}] +``` + +Optional arguments: + +- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30. +- `FONT_SCALE`: Font scale of the text added in the video. If not specified, it will be None. +- `FONT_COLOR`: Font color of the text added in the video. If not specified, it will be `white`. +- `TARGET_RESOLUTION`: Resolution(desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio. +- `OUT_FILE`: Path to the output file which can be a video format or gif format. If not specified, it will be set to `None` and does not generate the output file. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use checkpoint url from `configs/` to directly load corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Recognize a video file as input by using a TSN model on cuda by default. 
+ + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt + ``` + +2. Recognize a video file as input by using a TSN model on cuda by default, loading checkpoint from url. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt + ``` + +3. Recognize a video file as input by using a TSN model and then generate an mp4 file. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --out-filename demo/demo_out.mp4 + ``` + +## Video GradCAM Demo + +MMAction2 provides a demo script to visualize GradCAM results using a single video. + +```shell +python tools/visualizations/vis_cam.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} [--use-frames] \ + [--device ${DEVICE_TYPE}] [--target-layer-name ${TARGET_LAYER_NAME}] [--fps {FPS}] \ + [--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}] +``` + +- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `FPS`: FPS value of the output video when using rawframes as input. 
If not specified, it will be set to 30. +- `OUT_FILE`: Path to the output file which can be a video format or gif format. If not specified, it will be set to `None` and does not generate the output file. +- `TARGET_LAYER_NAME`: Layer name to generate GradCAM localization map. +- `TARGET_RESOLUTION`: Resolution(desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio. +- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bilinear`. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use checkpoint url from `configs/` to directly load corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Get GradCAM results of an I3D model, using a video file as input and then generate a gif file with 10 fps. + + ```shell + python tools/visualizations/vis_cam.py demo/demo_configs/i3d_r50_32x2x1_video_infer.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth demo/demo.mp4 \ + --target-layer-name backbone/layer4/1/relu --fps 10 \ + --out-filename demo/demo_gradcam.gif + ``` + +2. Get GradCAM results of a TSN model, using a video file as input and then generate a gif file, loading checkpoint from url. 
+ + ```shell + python tools/visualizations/vis_cam.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-dense-1x1x5-100e_kinetics400-rgb_20220906-dcbc6e01.pth \ + demo/demo.mp4 --target-layer-name backbone/layer4/1/relu --out-filename demo/demo_gradcam_tsn.gif + ``` + +## Webcam demo + +We provide a demo script to implement real-time action recognition from web camera. In order to get predict results in range `[0, 1]`, make sure to set `model.cls_head.average_clips='prob'` in config file. + +```shell +python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \ + [--device ${DEVICE_TYPE}] [--camera-id ${CAMERA_ID}] [--threshold ${THRESHOLD}] \ + [--average-size ${AVERAGE_SIZE}] [--drawing-fps ${DRAWING_FPS}] [--inference-fps ${INFERENCE_FPS}] +``` + +Optional arguments: + +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `CAMERA_ID`: ID of camera device. If not specified, it will be set to 0. +- `THRESHOLD`: Threshold of prediction score for action recognition. Only label with score higher than the threshold will be shown. If not specified, it will be set to 0. +- `AVERAGE_SIZE`: Number of latest clips to be averaged for prediction. If not specified, it will be set to 1. +- `DRAWING_FPS`: Upper bound FPS value of the output drawing. If not specified, it will be set to 20. +- `INFERENCE_FPS`: Upper bound FPS value of model inference. If not specified, it will be set to 4. + +If your hardware is good enough, increasing the value of `DRAWING_FPS` and `INFERENCE_FPS` will get a better experience. 
+ +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use checkpoint url from `configs/` to directly load corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Recognize the action from web camera as input by using a TSN model on cpu, averaging the score per 5 times + and outputting result labels with score higher than 0.2. + + ```shell + python demo/webcam_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth tools/data/kinetics/label_map_k400.txt --average-size 5 \ + --threshold 0.2 --device cpu + ``` + +2. Recognize the action from web camera as input by using a TSN model on cpu, averaging the score per 5 times + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/webcam_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + tools/data/kinetics/label_map_k400.txt --average-size 5 --threshold 0.2 --device cpu + ``` + +3. Recognize the action from web camera as input by using a I3D model on gpu by default, averaging the score per 5 times + and outputting result labels with score higher than 0.2. + + ```shell + python demo/webcam_demo.py demo/demo_configs/i3d_r50_32x2x1_video_infer.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth tools/data/kinetics/label_map_k400.txt \ + --average-size 5 --threshold 0.2 + ``` + +Considering the efficiency difference for users' hardware, Some modifications might be done to suit the case. 
+Users can change: + +- `SampleFrames` step (especially the number of `clip_len` and `num_clips`) of `test_pipeline` in the config file, like `--cfg-options test_pipeline.0.num_clips=3`. +- Change to the suitable Crop methods like `TenCrop`, `ThreeCrop`, `CenterCrop`, etc. in `test_pipeline` of the config file, like `--cfg-options test_pipeline.4.type=CenterCrop`. +- Change the number of `--average-size`. The smaller, the faster. + +## Long video demo + +We provide a demo script to predict different labels using a single long video. In order to get predict results in range `[0, 1]`, make sure to set `cls_head = dict(average_clips='prob')` in config file. + +```shell +python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \ + ${OUT_FILE} [--input-step ${INPUT_STEP}] [--device ${DEVICE_TYPE}] [--threshold ${THRESHOLD}] +``` + +Optional arguments: + +- `OUT_FILE`: Path to the output, either video or json file +- `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. +- `THRESHOLD`: Threshold of prediction score for action recognition. Only label with score higher than the threshold will be shown. If not specified, it will be set to 0.01. +- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames (`sample_length` indicates the size of temporal window from which you sample frames, which equals to `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. 
The desired value of `STRIDE` is (0, 1\], while it also works for `STRIDE > 1` (the generated predictions will be too sparse). Default: 0. +- `LABEL_COLOR`: Font Color of the labels in (B, G, R). Default is white, that is (256, 256, 256). +- `MSG_COLOR`: Font Color of the messages in (B, G, R). Default is gray, that is (128, 128, 128). + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use checkpoint url from `configs/` to directly load corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Predict different labels in a long video by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \ + --input-step 3 --device cpu --threshold 0.2 + ``` + +2. Predict different labels in a long video by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2 + ``` + +3. 
Predict different labels in a long video from web by using a TSN model on cpu, with 8 frames for input steps (that is, random sample one from each 3 frames) + and outputting result labels with score higher than 0.2, loading checkpoint from url. + + ```shell + python demo/long_video_demo.py demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4 \ + tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2 + ``` + +4. Predict different labels in a long video by using a I3D model on gpu, with input_step=1, threshold=0.01 as default and print the labels in cyan. + + ```shell + python demo/long_video_demo.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \ + --label-color 255 255 0 + ``` + +5. Predict different labels in a long video by using a I3D model on gpu and save the results as a `json` file + + ```shell + python demo/long_video_demo.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ + checkpoints/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt ./results.json + ``` + +## Skeleton-based Action Recognition Demo + +MMAction2 provides a demo script to predict the skeleton-based action recognition result using a single video. 
+ +```shell +python demo/demo_skeleton.py ${VIDEO_FILE} ${OUT_FILENAME} \ + [--config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \ + [--checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \ + [--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} +``` + +Optional arguments: + +- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path. +- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or url. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint path or url. +- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Defaults to 0.9. +- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint). +- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint path or url (trained on COCO-Keypoint). +- `LABEL_MAP`: The label map used. Defaults to `'tools/data/skeleton/label_map_ntu60.txt'`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `'cuda:0'` or `'cpu'`. Defaults to `'cuda:0'`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 480. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster-RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D-NTURGB+D-60-XSub-Keypoint as the skeleton-based action recognizer. 
+ +```shell +python demo/demo_skeleton.py demo/demo_skeleton.mp4 demo/demo_skeleton_out.mp4 \ + --config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu60_xsub_keypoint/slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --det-cat-id 0 \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --label-map tools/data/skeleton/label_map_ntu60.txt +``` + +2. Use the Faster-RCNN as the human detector, HRNetw32 as the pose estimator, STGCN-NTURGB+D-60-XSub-Keypoint as the skeleton-based action recognizer. 
+ +```shell +python demo/demo_skeleton.py demo/demo_skeleton.mp4 demo/demo_skeleton_out.mp4 \ + --config configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py \ + --checkpoint https://download.openmmlab.com/mmaction/v1.0/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20221129-484a394a.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --det-cat-id 0 \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --label-map tools/data/skeleton/label_map_ntu60.txt +``` + +## SpatioTemporal Action Detection Webcam Demo + +We provide a demo script to implement real-time spatio-temporal action detection from a web camera. + +```shell +python demo/webcam_demo_spatiotemporal_det.py \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--input-video] ${INPUT_VIDEO} \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--output-fps ${OUTPUT_FPS}] \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--show] \ + [--display-height] ${DISPLAY_HEIGHT} \ + [--display-width] ${DISPLAY_WIDTH} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--clip-vis-length] ${CLIP_VIS_LENGTH} +``` + +Optional arguments: + +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. 
+- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint path or URL. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.4. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Default: 0.9. +- `INPUT_VIDEO`: The webcam id or video path of the source. Default: `0`. +- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`. +- `OUTPUT_FPS`: The FPS of demo video output. Default: 15. +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: None. +- `--show`: Whether to show predictions with `cv2.imshow`. +- `DISPLAY_HEIGHT`: The height of the display frame. Default: 0. +- `DISPLAY_WIDTH`: The width of the display frame. Default: 0. If `DISPLAY_HEIGHT <= 0 and DISPLAY_WIDTH <= 0`, the display frame and input video share the same shape. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8. +- `CLIP_VIS_LENGTH`: The number of frames to draw for each clip. In other words, for each clip, there are at most `CLIP_VIS_LENGTH` frames to be drawn around the keyframe. Default: 8. + +Tips to get a better experience for webcam demo: + +- How to choose `--output-fps`? + + - `--output-fps` should be almost equal to read thread fps. + - Read thread fps is printed by logger in format `DEBUG:__main__:Read Thread: {duration} ms, {fps} fps` + +- How to choose `--predict-stepsize`? + + - It's related to how to choose human detector and spatio-temporal model. + - Overall, the duration of read thread for each task should be greater than or equal to that of model inference. + - The durations for read/inference are both printed by logger. 
+ - Larger `--predict-stepsize` leads to larger duration for read thread. + - In order to fully take the advantage of computation resources, decrease the value of `--predict-stepsize`. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster RCNN as the human detector, SlowOnly-8x8-R101 as the action detector. Making predictions per 40 frames, and FPS of the output is 20. Show predictions with `cv2.imshow`. + +```shell +python demo/webcam_demo_spatiotemporal_det.py \ + --input-video 0 \ + --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --action-score-thr 0.5 \ + --label-map tools/data/ava/label_map.txt \ + --predict-stepsize 40 \ + --output-fps 20 \ + --show +``` + +## SpatioTemporal Action Detection Video Demo + +MMAction2 provides a demo script to predict the SpatioTemporal Action Detection result using a single video. 
+ +```shell +python demo/demo_spatiotemporal_det.py --video ${VIDEO_FILE} \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--output-stepsize ${OUTPUT_STEPSIZE}] \ + [--output-fps ${OUTPUT_FPS}] +``` + +Optional arguments: + +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`. +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. +- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint URL. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9. +- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5. +- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8. +- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Defaults to 4. 
+- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster RCNN as the human detector, SlowOnly-8x8-R101 as the action detector. Making predictions per 8 frames, and output 1 frame per 4 frames to the output video. The FPS of the output video is 4. + +```shell +python demo/demo_spatiotemporal_det.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \ + --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --action-score-thr 0.5 \ + --label-map tools/data/ava/label_map.txt \ + --predict-stepsize 8 \ + --output-stepsize 4 \ + --output-fps 6 +``` + +## SpatioTemporal Action Detection ONNX Video Demo + +MMAction2 provides a demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. 
+ +```shell +python demo/demo_spatiotemporal_det_onnx.py --video ${VIDEO_FILE} \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--onnx-file ${SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--output-stepsize ${OUTPUT_STEPSIZE}] \ + [--output-fps ${OUTPUT_FPS}] +``` + +Optional arguments: + +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`. +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. +- `SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE`: The spatiotemporal action detection onnx file. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9. +- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5. +- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8. +- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Defaults to 4. 
+- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Export an onnx file given the config file and checkpoint. + +```shell +python tools/deployment/export_onnx_stdet.py \ + configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --output_file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --num_frames 8 +``` + +2. Use the Faster RCNN as the human detector, the generated `slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx` file as the action detector. Making predictions per 8 frames, and output 1 frame per 4 frames to the output video. The FPS of the output video is 4. + +```shell +python demo/demo_spatiotemporal_det_onnx.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \ + --config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --onnx-file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --action-score-thr 0.5 \ + --label-map tools/data/ava/label_map.txt \ + --predict-stepsize 8 \ + --output-stepsize 4 \ + --output-fps 6 +``` + +## Inferencer + +MMAction2 provides a demo script to implement fast prediction for video analysis tasks based on unified inferencer interface, currently only supports action recognition task. 
+ +```shell +python demo/demo.py ${INPUTS} \ + [--vid-out-dir ${VID_OUT_DIR}] \ + [--rec ${RECOG_TASK}] \ + [--rec-weights ${RECOG_WEIGHTS}] \ + [--label-file ${LABEL_FILE}] \ + [--device ${DEVICE_TYPE}] \ + [--batch-size ${BATCH_SIZE}] \ + [--print-result ${PRINT_RESULT}] \ + [--pred-out-file ${PRED_OUT_FILE}] +``` + +Optional arguments: + +- `--show`: If specified, the demo will display the video in a popup window. +- `--print-result`: If specified, the demo will print the inference results. +- `VID_OUT_DIR`: Output directory of saved videos. Defaults to None, means not to save videos. +- `RECOG_TASK`: Type of Action Recognition algorithm. It could be the path to the config file, the model name or alias defined in metafile. +- `RECOG_WEIGHTS`: Path to the custom checkpoint file of the selected recog model. If it is not specified and "rec" is a model name of metafile, the weights will be loaded from metafile. +- `LABEL_FILE`: Label file for dataset the algorithm pretrained on. Defaults to None, means don't show label in result. +- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`. +- `BATCH_SIZE`: The batch size used in inference. Defaults to 1. +- `PRED_OUT_FILE`: File path to save the inference results. Defaults to None, means not to save prediction results. + +Examples: + +Assume that you are located at `$MMACTION2`. + +1. Recognize a video file as input by using a TSN model, loading checkpoint from metafile. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb \ + --label-file tools/data/kinetics/label_map_k400.txt + ``` + +2. Recognize a video file as input by using a TSN model, using model alias in metafile. 
+ + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn \ + --label-file tools/data/kinetics/label_map_k400.txt + ``` + +3. Recognize a video file as input by using a TSN model, and then save visualization video. + + ```shell + # The demo.mp4 and label_map_k400.txt are both from Kinetics-400 + python demo/demo_inferencer.py demo/demo.mp4 \ + --vid-out-dir demo_out \ + --rec tsn \ + --label-file tools/data/kinetics/label_map_k400.txt + ``` + +## Audio Demo + +Demo script to predict the audio-based action recognition using a single audio feature. + +The script [`extract_audio.py`](/tools/data/extract_audio.py) can be used to extract audio from videos and the script [`build_audio_features.py`](/tools/data/build_audio_features.py) can be used to extract the audio features. + +```shell +python demo/demo_audio.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${AUDIO_FILE} ${LABEL_FILE} [--device ${DEVICE}] +``` + +Optional arguments: + +- `DEVICE`: Type of device to run the demo. Allowed values are cuda devices like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`. + +Examples: + +Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`, +or use checkpoint url from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`. + +1. Recognize an audio file as input by using a tsn model on cuda by default. 
+ + ```shell + python demo/demo_audio.py \ + configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature.py \ + https://download.openmmlab.com/mmaction/v1.0/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio-feature_20230702-e4642fb0.pth \ + audio_feature.npy tools/data/kinetics/label_map_k400.txt + ``` + +## Video Structuralize Demo + +We provide a demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection result using a single video. + +```shell +python demo/demo_video_structuralize.py \ + [--rgb-stdet-config ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--rgb-stdet-checkpoint ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \ + [--skeleton-stdet-checkpoint ${SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \ + [--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \ + [--skeleton-config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \ + [--skeleton-checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \ + [--rgb-config ${RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \ + [--rgb-checkpoint ${RGB_BASED_ACTION_RECOGNITION_CHECKPOINT}] \ + [--use-skeleton-stdet ${USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD}] \ + [--use-skeleton-recog ${USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRE}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRE}] \ + [--video ${VIDEO_FILE}] \ + [--label-map-stdet ${LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION}] \ + [--device ${DEVICE}] \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--output-stepsize ${OUTPU_STEPSIZE}] \ + [--output-fps ${OUTPUT_FPS}] \ + [--cfg-options] +``` + +Optional 
arguments: + +- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The rgb-based spatio temporal action detection config file path. +- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The rgb-based spatio temporal action detection checkpoint path or URL. +- `SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The skeleton-based spatio temporal action detection checkpoint path or URL. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint). +- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint). +- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path. +- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL. +- `RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The rgb-based action recognition config file path. +- `RGB_BASED_ACTION_RECOGNITION_CHECKPOINT`: The rgb-based action recognition checkpoint path or URL. +- `USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD`: Use skeleton-based spatio temporal action detection method. +- `USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD`: Use skeleton-based action recognition method. +- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Default: 0.9. +- `ACTION_DETECTION_SCORE_THRE`: The score threshold for action detection. Default: 0.4. +- `LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION`: The label map for spatio temporal action detection used. Default: `tools/data/ava/label_map.txt`. +- `LABEL_MAP`: The label map for action recognition. Default: `tools/data/kinetics/label_map_k400.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`. 
+- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/test_stdet_recognition_output.mp4`. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8. +- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 1. +- `OUTPUT_FPS`: The FPS of demo video output. Default: 24. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer and the skeleton-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. + +```shell +python demo/demo_video_structuralize.py \ + --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \ + --use-skeleton-stdet \ + --use-skeleton-recog \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +2. Use the Faster RCNN as the human detector, TSN-R50-1x1x3 as the rgb-based action recognizer, SlowOnly-8x8-R101 as the rgb-based spatio temporal action detector. 
Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. + +```shell +python demo/demo_video_structuralize.py \ + --rgb-stdet-config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --rgb-config demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + --rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +3. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer, SlowOnly-8x8-R101 as the rgb-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. 
+ +```shell +python demo/demo_video_structuralize.py \ + --rgb-stdet-config configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \ + --use-skeleton-recog \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` + +4. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, TSN-R50-1x1x3 as the rgb-based action recognizer, PoseC3D as the skeleton-based spatio temporal action detector. Making action detection predictions per 8 frames, and output 1 frame per 1 frame to the output video. The FPS of the output video is 24. 
+ +```shell +python demo/demo_video_structuralize.py + --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --pose-config demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \ + --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \ + --skeleton-config configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \ + --rgb-config demo/demo_configs/tsn_r50_1x1x8_video_infer.py \ + --rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \ + --use-skeleton-stdet \ + --label-map-stdet tools/data/ava/label_map.txt \ + --label-map tools/data/kinetics/label_map_k400.txt +``` diff --git a/demo/demo.ipynb b/demo/demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..509d1852151a49ab24ba64de21d1a44c2f43c7a2 --- /dev/null +++ b/demo/demo.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "from mmaction.apis import init_recognizer, inference_recognizer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "config_file = '../demo/demo_configs/tsn_r50_1x1x8_video_infer.py'\n", + "# download the checkpoint from model zoo and put it in `checkpoints/`\n", + "checkpoint_file = '../checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth'" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by local backend from path: ../checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth\n" + ] + } + ], + "source": [ + "# build the model from a config file and a checkpoint file\n", + "model = init_recognizer(config_file, checkpoint_file, device='cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# test a single video and show the result:\n", + "video = 'demo.mp4'\n", + "label = '../tools/data/kinetics/label_map_k400.txt'\n", + "results = inference_recognizer(model, video)\n", + "\n", + "pred_scores = results.pred_score.tolist()\n", + "score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))\n", + "score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)\n", + "top5_label = score_sorted[:5]\n", + "\n", + "labels = open(label).readlines()\n", + "labels = [x.strip() for x in labels]\n", + "results = [(labels[k[0]], k[1]) for k in top5_label]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "arm wrestling: 1.0\n", + "rock scissors paper: 1.698846019067312e-15\n", + "massaging feet: 5.157996544393221e-16\n", + "stretching leg: 1.018867278715779e-16\n", + "bench pressing: 7.110452486439706e-17\n" + ] + } + ], + "source": [ + "# show the results\n", + "for result in results:\n", + " print(f'{result[0]}: ', result[1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mmact_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13 (default, Mar 29 2022, 02:18:16) \n[GCC 7.5.0]" + }, + "vscode": { + "interpreter": { + "hash": "189c342a4747645665e89db23000ac4d4edb7a87c4cd0b2f881610f468fb778d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/demo.mp4 b/demo/demo.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cec78ef29eba4f72493a94797869bcf9b61a827b --- /dev/null +++ b/demo/demo.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:071e60535eaf0aed475ddac06269ee0cdfc4740158f22d9ccd2c3b93b42aa344 +size 635539 diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..d536831ed22442328001e22a75b22b46421ed985 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from operator import itemgetter +from typing import Optional, Tuple + +from mmengine import Config, DictAction + +from mmaction.apis import inference_recognizer, init_recognizer +from mmaction.visualization import ActionVisualizer + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video', help='video file/url or rawframes directory') + parser.add_argument('label', help='label file') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
def get_output(
    video_path: str,
    out_filename: str,
    data_sample: 'ActionDataSample',
    labels: list,
    fps: int = 30,
    font_scale: Optional[float] = None,
    font_color: str = 'white',
    target_resolution: Optional[Tuple[int, int]] = None,
) -> None:
    """Get demo output using ``moviepy``.

    This function will generate video file or gif file from raw video or
    frames, by using ``moviepy``. For more information of some parameters,
    you can refer to: https://github.com/Zulko/moviepy.

    Args:
        video_path (str): The video file path.
        out_filename (str): Output filename for the generated file.
        data_sample (ActionDataSample): Predicted result of the input video.
        labels (list): Label list of current dataset.
        fps (int): Number of picture frames to read per second.
            Defaults to 30.
        font_scale (float, optional): Font scale of the text.
            Defaults to None.
        font_color (str): Font color of the text. Defaults to ``white``.
        target_resolution (Tuple[int, int], optional): Set to
            (desired_width, desired_height) to have resized frames. If
            either dimension is None, the frames are resized by keeping
            the existing aspect ratio. Defaults to None.
    """

    # Remote inputs cannot be re-read for visualization.
    if video_path.startswith(('http://', 'https://')):
        raise NotImplementedError

    # Choose the output container from the filename extension.
    out_type = 'gif' if osp.splitext(out_filename)[1] == '.gif' else 'video'
    visualizer = ActionVisualizer()
    visualizer.dataset_meta = dict(classes=labels)

    text_cfg = {'colors': font_color}
    if font_scale is not None:
        text_cfg.update({'font_sizes': font_scale})

    visualizer.add_datasample(
        out_filename,
        video_path,
        data_sample,
        draw_pred=True,
        draw_gt=False,
        text_cfg=text_cfg,
        fps=fps,
        out_type=out_type,
        # The demo always writes its artifacts under the ``demo`` folder.
        out_path=osp.join('demo', out_filename),
        target_resolution=target_resolution)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from operator import itemgetter

import torch
from mmengine import Config, DictAction

from mmaction.apis import inference_recognizer, init_recognizer


def parse_args():
    """Parse command-line arguments for the audio recognition demo."""
    parser = argparse.ArgumentParser(description='MMAction2 demo')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file/url')
    parser.add_argument('audio', help='audio file')
    parser.add_argument('label', help='label file')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='CPU/CUDA device option')
    args = parser.parse_args()
    return args


def main():
    """Run audio-based recognition and print the top-5 predicted labels."""
    args = parse_args()
    device = torch.device(args.device)
    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)
    model = init_recognizer(cfg, args.checkpoint, device=device)

    # Only pre-extracted spectrogram features (.npy) are supported.
    if not args.audio.endswith('.npy'):
        raise NotImplementedError('Demo works on extracted audio features')
    pred_result = inference_recognizer(model, args.audio)

    pred_scores = pred_result.pred_score.tolist()
    score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))
    score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
    top5_label = score_sorted[:5]

    # Fix: use a context manager so the label file handle is closed
    # deterministically instead of leaking it.
    with open(args.label) as f:
        labels = [x.strip() for x in f]
    results = [(labels[k[0]], k[1]) for k in top5_label]

    print('The top-5 labels with corresponding scores are:')
    for result in results:
        print(f'{result[0]}: ', result[1])


if __name__ == '__main__':
    main()
+model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + 
pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + metainfo=dict(classes=('person', ), palette=[(220, 20, 60)]))) diff --git a/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py b/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e851abadc79ffb5821cc9e27b099e990e156a1 --- /dev/null +++ b/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# model settings +model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + 
add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py b/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..54259066695fd6081343833b756c9d1184f5dc2f --- /dev/null +++ b/demo/demo_configs/i3d_r50_32x2x1_rawframes_infer.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = ['../../configs/_base_/models/i3d_r50.py'] + +# dataset settings +dataset_type = 'RawframeDataset' +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/i3d_r50_32x2x1_video_infer.py b/demo/demo_configs/i3d_r50_32x2x1_video_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..fe3b11b819f2afc1ff1cefb0b6861fb7b7d84261 --- /dev/null +++ b/demo/demo_configs/i3d_r50_32x2x1_video_infer.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['../../configs/_base_/models/i3d_r50.py'] + +# dataset settings +dataset_type = 'VideoDataset' +test_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py b/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..e662527ef1d6d002e749e1963e2c7ec885ad2305 --- /dev/null +++ b/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + _scope_='mmpose', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose' + '/pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +test_pipeline = [ + dict(type='mmpose.LoadImage', file_client_args=file_client_args), + dict(type='mmpose.GetBBoxCenterScale'), + dict(type='mmpose.TopdownAffine', input_size=codec['input_size']), + dict(type='mmpose.PackPoseInputs') +] +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + 
bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# visualizer +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmpose.PoseLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') diff --git a/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py b/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7de4667a94185d8308e0834f3c8bc171fc4515f4 --- /dev/null +++ b/demo/demo_configs/tsn_r50_1x1x8_rawframes_infer.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['../../configs/_base_/models/tsn_r50.py'] + +# dataset settings +dataset_type = 'RawframeDataset' +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + ann_file=None, + data_prefix=None, + pipeline=test_pipeline)) diff --git a/demo/demo_configs/tsn_r50_1x1x8_video_infer.py b/demo/demo_configs/tsn_r50_1x1x8_video_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7a256294c0e47efe6df8869835787b8bc24f06f4 --- /dev/null +++ b/demo/demo_configs/tsn_r50_1x1x8_video_infer.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def parse_args():
    """Parse CLI options, splitting them into inferencer-construction
    kwargs and per-call kwargs."""
    parser = ArgumentParser()
    parser.add_argument(
        'inputs', type=str, help='Input video file or rawframes folder path.')
    parser.add_argument(
        '--vid-out-dir',
        type=str,
        default='',
        help='Output directory of videos.')
    parser.add_argument(
        '--rec',
        type=str,
        default=None,
        help='Pretrained action recognition algorithm. It\'s the path to the '
        'config file or the model name defined in metafile.')
    parser.add_argument(
        '--rec-weights',
        type=str,
        default=None,
        help='Path to the custom checkpoint file of the selected recog model. '
        'If it is not specified and "rec" is a model name of metafile, the '
        'weights will be loaded from metafile.')
    parser.add_argument(
        '--label-file', type=str, default=None, help='label file for dataset.')
    parser.add_argument(
        '--device',
        type=str,
        default=None,
        help='Device used for inference. '
        'If not specified, the available device will be automatically used.')
    parser.add_argument(
        '--batch-size', type=int, default=1, help='Inference batch size.')
    parser.add_argument(
        '--show',
        action='store_true',
        help='Display the video in a popup window.')
    parser.add_argument(
        '--print-result',
        action='store_true',
        help='Whether to print the results.')
    parser.add_argument(
        '--pred-out-file',
        type=str,
        default='',
        help='File to save the inference results.')

    call_args = vars(parser.parse_args())

    # These options configure the inferencer itself; everything else is
    # forwarded to the inference call.
    init_args = {
        key: call_args.pop(key)
        for key in ('rec', 'rec_weights', 'device', 'label_file')
    }

    return init_args, call_args


def main():
    init_args, call_args = parse_args()
    inferencer = MMAction2Inferencer(**init_args)
    inferencer(**call_args)
import argparse
import tempfile

import cv2
import mmcv
import mmengine
import torch
from mmengine import DictAction
from mmengine.utils import track_iter_progress

from mmaction.apis import (detection_inference, inference_skeleton,
                           init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.utils import frame_extract

try:
    import moviepy.editor as mpy
except ImportError:
    raise ImportError('Please install moviepy to enable output file')

# OpenCV text-rendering settings for the action label overlay.
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.75
FONTCOLOR = (255, 255, 255)  # BGR, white
THICKNESS = 1
LINETYPE = 1


def parse_args():
    """Parse command-line arguments for the skeleton-based demo."""
    parser = argparse.ArgumentParser(description='MMAction2 demo')
    parser.add_argument('video', help='video file/url')
    parser.add_argument('out_filename', help='output filename')
    parser.add_argument(
        '--config',
        default=('configs/skeleton/posec3d/'
                 'slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py'),
        help='skeleton model config file path')
    parser.add_argument(
        '--checkpoint',
        default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/'
                 'slowonly_r50_u48_240e_ntu60_xsub_keypoint/'
                 'slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth'),
        help='skeleton model checkpoint file/url')
    parser.add_argument(
        '--det-config',
        default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py',
        help='human detection config file path (from mmdet)')
    parser.add_argument(
        '--det-checkpoint',
        default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
                 'faster_rcnn_r50_fpn_2x_coco/'
                 'faster_rcnn_r50_fpn_2x_coco_'
                 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
        help='human detection checkpoint file/url')
    parser.add_argument(
        '--det-score-thr',
        type=float,
        default=0.9,
        help='the threshold of human detection score')
    parser.add_argument(
        '--det-cat-id',
        type=int,
        default=0,
        help='the category id for human detection')
    parser.add_argument(
        '--pose-config',
        default='demo/demo_configs/'
        'td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py',
        help='human pose estimation config file path (from mmpose)')
    parser.add_argument(
        '--pose-checkpoint',
        default=('https://download.openmmlab.com/mmpose/top_down/hrnet/'
                 'hrnet_w32_coco_256x192-c78dce93_20200708.pth'),
        help='human pose estimation checkpoint file/url')
    parser.add_argument(
        '--label-map',
        default='tools/data/skeleton/label_map_ntu60.txt',
        help='label map file')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='CPU/CUDA device option')
    parser.add_argument(
        '--short-side',
        type=int,
        default=480,
        help='specify the short-side length of the image')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    args = parser.parse_args()
    return args


def visualize(args, frames, data_samples, action_label):
    """Draw pose estimation results and the predicted action label on each
    frame, then write the sequence to ``args.out_filename``.

    Args:
        args (argparse.Namespace): Parsed CLI arguments.
        frames (list[np.ndarray]): BGR frames extracted from the video.
        data_samples (list): Pose estimation data samples, one per frame.
        action_label (str): Predicted action name to overlay.
    """
    pose_config = mmengine.Config.fromfile(args.pose_config)
    visualizer = VISUALIZERS.build(pose_config.visualizer)
    visualizer.set_dataset_meta(data_samples[0].dataset_meta)

    vis_frames = []
    print('Drawing skeleton for each frame')
    for d, f in track_iter_progress(list(zip(data_samples, frames))):
        # The visualizer expects RGB input while frames are read as BGR.
        f = mmcv.imconvert(f, 'bgr', 'rgb')
        visualizer.add_datasample(
            'result',
            f,
            data_sample=d,
            draw_gt=False,
            draw_heatmap=False,
            draw_bbox=True,
            show=False,
            wait_time=0,
            out_file=None,
            kpt_thr=0.3)
        vis_frame = visualizer.get_image()
        cv2.putText(vis_frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)
        vis_frames.append(vis_frame)

    vid = mpy.ImageSequenceClip(vis_frames, fps=24)
    vid.write_videofile(args.out_filename, remove_temp=True)


def main():
    """Detect humans, estimate poses, recognize the action and render it."""
    args = parse_args()

    # Fix: the temp dir is now a context manager, so extracted frames are
    # cleaned up even when detection/pose/recognition raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        frame_paths, frames = frame_extract(args.video, args.short_side,
                                            tmp_dir)

        h, w, _ = frames[0].shape

        # Get Human detection results.
        det_results, _ = detection_inference(args.det_config,
                                             args.det_checkpoint, frame_paths,
                                             args.det_score_thr,
                                             args.det_cat_id, args.device)
        torch.cuda.empty_cache()

        # Get Pose estimation results.
        pose_results, pose_data_samples = pose_inference(
            args.pose_config, args.pose_checkpoint, frame_paths, det_results,
            args.device)
        torch.cuda.empty_cache()

        config = mmengine.Config.fromfile(args.config)
        config.merge_from_dict(args.cfg_options)

        model = init_recognizer(config, args.checkpoint, args.device)
        result = inference_skeleton(model, pose_results, (h, w))

        max_pred_index = result.pred_score.argmax().item()
        # Fix: close the label-map file instead of leaking the handle.
        with open(args.label_map) as f:
            label_map = [x.strip() for x in f]
        action_label = label_map[max_pred_index]

        visualize(args, frames, pose_data_samples, action_label)


if __name__ == '__main__':
    main()
def hex2color(h):
    """Convert the 6-digit hex string to tuple of 3 int value (RGB)."""
    return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))


# Color plates (light-to-dark gradients) for painting label backgrounds.
plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]


def visualize(frames, annotations, plate=plate_blue, max_num=5):
    """Visualize frames with predicted annotations.

    Args:
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted results.
        plate (list[tuple]): The color plate used for visualization.
            Default: plate_blue.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.
    Returns:
        list[np.ndarray]: Visualized frames.
    """

    assert max_num + 1 <= len(plate)
    # Plate colors are RGB; OpenCV draws BGR, so reverse each tuple.
    plate = [x[::-1] for x in plate]
    frames_out = cp.deepcopy(frames)
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = len(frames) // len(annotations)
    anno = None
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])
    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_out[ind]
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                # Boxes are normalized; scale back to pixel coordinates.
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                cv2.rectangle(frame, st, ed, plate[0], 2)
                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{score[k]:>.2f}'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                FONTCOLOR, THICKNESS, LINETYPE)

    return frames_out


def load_label_map(file_path):
    """Load Label Map.

    Args:
        file_path (str): The file path of label map, with one
            ``index: label`` pair per line.
    Returns:
        dict: The label map (int -> label name).
    """
    # Fix: close the file deterministically instead of leaking the handle.
    with open(file_path) as f:
        lines = [x.strip().split(': ') for x in f]
    return {int(x[0]): x[1] for x in lines}


def abbrev(name):
    """Get the abbreviation of label name:

    'take (an object) from (a person)' -> 'take ... from ...'
    """
    while name.find('(') != -1:
        st, ed = name.find('('), name.find(')')
        name = name[:st] + '...' + name[ed + 1:]
    return name


def pack_result(human_detection, result, img_h, img_w):
    """Pack the detected human boxes and action predictions together.

    Args:
        human_detection (np.ndarray): Human detection result.
        result (list): The predicted (label, score) pairs of each human
            proposal.
        img_h (int): The image height.
        img_w (int): The image width.
    Returns:
        tuple: Tuple of human proposal, label name and label score.
    """
    # Normalize box coordinates to [0, 1] (modifies the input in place).
    human_detection[:, 0::2] /= img_w
    human_detection[:, 1::2] /= img_h
    results = []
    if result is None:
        return None
    for prop, res in zip(human_detection, result):
        # NOTE(review): ``prop.data.cpu()`` implies a torch.Tensor input
        # despite the np.ndarray annotation above — confirm with callers.
        res.sort(key=lambda x: -x[1])
        results.append(
            (prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
                                                            for x in res]))
    return results


def parse_args():
    """Parse command-line arguments for the spatio-temporal detection demo."""
    parser = argparse.ArgumentParser(description='MMAction2 demo')
    parser.add_argument('video', help='video file/url')
    parser.add_argument('out_filename', help='output filename')
    parser.add_argument(
        '--config',
        default=('configs/detection/slowonly/slowonly_kinetics400-pretrained-'
                 'r101_8xb16-8x8x1-20e_ava21-rgb.py'),
        help='spatialtemporal detection model config file path')
    parser.add_argument(
        '--checkpoint',
        default=('https://download.openmmlab.com/mmaction/detection/ava/'
                 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
                 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_'
                 '20201217-16378594.pth'),
        help='spatialtemporal detection model checkpoint file/url')
    parser.add_argument(
        '--det-config',
        default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py',
        help='human detection config file path (from mmdet)')
    parser.add_argument(
        '--det-checkpoint',
        default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
                 'faster_rcnn_r50_fpn_2x_coco/'
                 'faster_rcnn_r50_fpn_2x_coco_'
                 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
        help='human detection checkpoint file/url')
    parser.add_argument(
        '--det-score-thr',
        type=float,
        default=0.9,
        help='the threshold of human detection score')
    parser.add_argument(
        '--det-cat-id',
        type=int,
        default=0,
        help='the category id for human detection')
    parser.add_argument(
        '--action-score-thr',
        type=float,
        default=0.5,
        help='the threshold of human action score')
    parser.add_argument(
        '--label-map',
        default='tools/data/ava/label_map.txt',
        help='label map file')
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='CPU/CUDA device option')
    parser.add_argument(
        '--short-side',
        type=int,
        default=256,
        help='specify the short-side length of the image')
    parser.add_argument(
        '--predict-stepsize',
        default=8,
        type=int,
        help='give out a prediction per n frames')
    parser.add_argument(
        '--output-stepsize',
        default=4,
        type=int,
        help=('show one frame per n frames in the demo, we should have: '
              'predict_stepsize % output_stepsize == 0'))
    parser.add_argument(
        '--output-fps',
        default=6,
        type=int,
        help='the fps of demo video output')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    args = parser.parse_args()
    return args
have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. 
+ config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + + config.model.backbone.pretrained = None + model = MODELS.build(config.model) + + load_checkpoint(model, args.checkpoint, map_location='cpu') + model.to(args.device) + model.eval() + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(args.device) + + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=proposal) + datasample.set_metainfo(dict(img_shape=(new_h, new_w))) + with torch.no_grad(): + result = model(input_tensor, [datasample], mode='predict') + scores = result[0].pred_instances.scores + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, + i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, 
prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3ee2ff14966951667667469cef6fbfc49fb1e493 --- /dev/null +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -0,0 +1,358 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import copy as cp +import tempfile + +import cv2 +import mmcv +import mmengine +import numpy as np +import onnxruntime +import torch +from mmdet.structures.bbox import bbox2roi +from mmengine import DictAction + +from mmaction.apis import detection_inference +from mmaction.utils import frame_extract, get_str_type + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + Returns: + list[np.ndarray]: Visualized frames. 
+ """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_out = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_out[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_out + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. 
+ img_h (int): The image height. + img_w (int): The image width. + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/detection/slowonly/slowonly_k700-pre' + '-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'), + help='spatialtemporal detection model config file path') + parser.add_argument( + '--onnx-file', help='spatialtemporal detection onnx file path') + + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.5, + help='the threshold of human action score') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + 
type=int, + default=256, + help='specify the short-side length of the image') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=6, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # resize frames to shortside + new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Get clip_len, frame_interval and calculate center index of each clip + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + val_pipeline = config.val_pipeline + + sampler = [ + x for x in val_pipeline if get_str_type(x['type']) == 'SampleAVAFrames' + ][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if 
config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + session = onnxruntime.InferenceSession(args.onnx_file) + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + rois = bbox2roi([proposal]) + + input_feed = { + 'input_tensor': input_array, + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + logits = outputs[0] + scores = 1 / (1 + np.exp(-logits)) + + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in 
range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/demo_video_structuralize.py b/demo/demo_video_structuralize.py new file mode 100644 index 0000000000000000000000000000000000000000..8ff3e584eabfcc407931e260e346ff04eb79c7a2 --- /dev/null +++ b/demo/demo_video_structuralize.py @@ -0,0 +1,672 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import copy as cp +import tempfile +import warnings + +import cv2 +import mmcv +import mmengine +import numpy as np +import torch +from mmengine import DictAction +from mmengine.structures import InstanceData + +from mmaction.apis import (detection_inference, inference_recognizer, + inference_skeleton, init_recognizer, pose_inference) +from mmaction.registry import VISUALIZERS +from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract + +try: + from mmdet.apis import init_detector +except (ImportError, ModuleNotFoundError): + warnings.warn('Failed to import `init_detector` form `mmdet.apis`. ' + 'These apis are required in skeleton-based applications! ') + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +PLATEBLUE = PLATEBLUE.split('-') +PLATEBLUE = [hex2color(h) for h in PLATEBLUE] +PLATEGREEN = '004b23-006400-007200-008000-38b000-70e000' +PLATEGREEN = PLATEGREEN.split('-') +PLATEGREEN = [hex2color(h) for h in PLATEGREEN] + + +def visualize(args, + frames, + annotations, + pose_data_samples, + action_result, + plate=PLATEBLUE, + max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted spatio-temporal + detection results. + pose_data_samples (list[list[PoseDataSample]): The pose results. + action_result (str): The predicted action recognition results. 
+ pose_model (nn.Module): The constructed pose model. + plate (str): The plate used for visualization. Default: PLATEBLUE. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + + Returns: + list[np.ndarray]: Visualized frames. + """ + + assert max_num + 1 <= len(plate) + frames_ = cp.deepcopy(frames) + frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_] + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + + # add pose results + if pose_data_samples: + pose_config = mmengine.Config.fromfile(args.pose_config) + visualizer = VISUALIZERS.build(pose_config.visualizer) + visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta) + for i, (d, f) in enumerate(zip(pose_data_samples, frames_)): + visualizer.add_datasample( + 'result', + f, + data_sample=d, + draw_gt=False, + draw_heatmap=False, + draw_bbox=True, + show=False, + wait_time=0, + out_file=None, + kpt_thr=0.3) + frames_[i] = visualizer.get_image() + cv2.putText(frames_[i], action_result, (10, 30), FONTFACE, + FONTSCALE, FONTCOLOR, THICKNESS, LINETYPE) + + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_[ind] + + # add action result for whole video + cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + # add spatio-temporal action detection results + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + if not pose_data_samples: + cv2.rectangle(frame, st, ed, plate[0], 2) + + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, f'{score[k]:.3f}']) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = 
cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_ + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument( + '--rgb-stdet-config', + default=( + 'configs/detection/slowonly/' + 'slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py' + ), + help='rgb-based spatio temporal detection config file path') + parser.add_argument( + '--rgb-stdet-checkpoint', + default=('https://download.openmmlab.com/mmaction/detection/ava/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb' + '_20201217-16378594.pth'), + help='rgb-based spatio temporal detection checkpoint file/url') + parser.add_argument( + '--skeleton-stdet-checkpoint', + default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/' + 'posec3d_ava.pth'), + help='skeleton-based spatio temporal detection checkpoint file/url') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/' + 'faster_rcnn/faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--pose-config', + default='demo/demo_configs' + '/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py', + help='human pose estimation config file path (from mmpose)') + parser.add_argument( + '--pose-checkpoint', + default=('https://download.openmmlab.com/mmpose/top_down/hrnet/' + 
'hrnet_w32_coco_256x192-c78dce93_20200708.pth'), + help='human pose estimation checkpoint file/url') + parser.add_argument( + '--skeleton-config', + default='configs/skeleton/posec3d' + '/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py', + help='skeleton-based action recognition config file path') + parser.add_argument( + '--skeleton-checkpoint', + default='https://download.openmmlab.com/mmaction/skeleton/posec3d/' + 'posec3d_k400.pth', + help='skeleton-based action recognition checkpoint file/url') + parser.add_argument( + '--rgb-config', + default='configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py', + help='rgb-based action recognition config file path') + parser.add_argument( + '--rgb-checkpoint', + default='https://download.openmmlab.com/mmaction/recognition/' + 'tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/' + 'tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth', + help='rgb-based action recognition checkpoint file/url') + parser.add_argument( + '--use-skeleton-stdet', + action='store_true', + help='use skeleton-based spatio temporal detection method') + parser.add_argument( + '--use-skeleton-recog', + action='store_true', + help='use skeleton-based action recognition method') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.4, + help='the threshold of action prediction score') + parser.add_argument( + '--video', + default='demo/test_video_structuralize.mp4', + help='video file/url') + parser.add_argument( + '--label-map-stdet', + default='tools/data/ava/label_map.txt', + help='label map file for spatio-temporal action detection') + parser.add_argument( + '--label-map', + default='tools/data/kinetics/label_map_k400.txt', + help='label map file for action recognition') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + 
parser.add_argument( + '--out-filename', + default='demo/test_stdet_recognition_output.mp4', + help='output filename') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a spatio-temporal detection prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=1, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=24, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + img_h (int): The image height. + img_w (int): The image width. + + Returns: + tuple: Tuple of human proposal, label name and label score. 
+ """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def expand_bbox(bbox, h, w, ratio=1.25): + x1, y1, x2, y2 = bbox + center_x = (x1 + x2) // 2 + center_y = (y1 + y2) // 2 + width = x2 - x1 + height = y2 - y1 + + square_l = max(width, height) + new_width = new_height = square_l * ratio + + new_x1 = max(0, int(center_x - new_width / 2)) + new_x2 = min(int(center_x + new_width / 2), w) + new_y1 = max(0, int(center_y - new_height / 2)) + new_y2 = min(int(center_y + new_height / 2), h) + return (new_x1, new_y1, new_x2, new_y2) + + +def cal_iou(box1, box2): + xmin1, ymin1, xmax1, ymax1 = box1 + xmin2, ymin2, xmax2, ymax2 = box2 + + s1 = (xmax1 - xmin1) * (ymax1 - ymin1) + s2 = (xmax2 - xmin2) * (ymax2 - ymin2) + + xmin = max(xmin1, xmin2) + ymin = max(ymin1, ymin2) + xmax = min(xmax1, xmax2) + ymax = min(ymax1, ymax2) + + w = max(0, xmax - xmin) + h = max(0, ymax - ymin) + intersect = w * h + union = s1 + s2 - intersect + iou = intersect / union + + return iou + + +def skeleton_based_action_recognition(args, pose_results, h, w): + label_map = [x.strip() for x in open(args.label_map).readlines()] + num_class = len(label_map) + + skeleton_config = mmengine.Config.fromfile(args.skeleton_config) + skeleton_config.model.cls_head.num_classes = num_class # for K400 dataset + + skeleton_model = init_recognizer( + skeleton_config, args.skeleton_checkpoint, device=args.device) + result = inference_skeleton(skeleton_model, pose_results, (h, w)) + action_idx = result.pred_score.argmax().item() + return label_map[action_idx] + + +def rgb_based_action_recognition(args): + rgb_config = mmengine.Config.fromfile(args.rgb_config) + rgb_config.model.backbone.pretrained = None + rgb_model = init_recognizer(rgb_config, 
args.rgb_checkpoint, args.device) + action_results = inference_recognizer(rgb_model, args.video) + rgb_action_result = action_results.pred_score.argmax().item() + label_map = [x.strip() for x in open(args.label_map).readlines()] + return label_map[rgb_action_result] + + +def skeleton_based_stdet(args, label_map, human_detections, pose_results, + num_frame, clip_len, frame_interval, h, w): + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + skeleton_config = mmengine.Config.fromfile(args.skeleton_config) + num_class = max(label_map.keys()) + 1 # for AVA dataset (81) + skeleton_config.model.cls_head.num_classes = num_class + skeleton_stdet_model = init_recognizer(skeleton_config, + args.skeleton_stdet_checkpoint, + args.device) + + skeleton_predictions = [] + + print('Performing SpatioTemporal Action Detection for each clip') + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp in timestamps: + proposal = human_detections[timestamp - 1] + if proposal.shape[0] == 0: # no people detected + skeleton_predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + num_frame = len(frame_inds) # 30 + + pose_result = [pose_results[ind] for ind in frame_inds] + + skeleton_prediction = [] + for i in range(proposal.shape[0]): # num_person + skeleton_prediction.append([]) + + fake_anno = dict( + frame_dict='', + label=-1, + img_shape=(h, w), + origin_shape=(h, w), + start_index=0, + modality='Pose', + total_frames=num_frame) + num_person = 1 + + num_keypoint = 17 + keypoint = np.zeros( + (num_person, num_frame, num_keypoint, 2)) # M T V 2 + keypoint_score = np.zeros( + (num_person, num_frame, num_keypoint)) # M T V + + # pose matching + person_bbox = 
proposal[i][:4] + area = expand_bbox(person_bbox, h, w) + + for j, poses in enumerate(pose_result): # num_frame + max_iou = float('-inf') + index = -1 + if len(poses['keypoints']) == 0: + continue + for k, bbox in enumerate(poses['bboxes']): + iou = cal_iou(bbox, area) + if max_iou < iou: + index = k + max_iou = iou + keypoint[0, j] = poses['keypoints'][index] + keypoint_score[0, j] = poses['keypoint_scores'][index] + + fake_anno['keypoint'] = keypoint + fake_anno['keypoint_score'] = keypoint_score + + output = inference_recognizer(skeleton_stdet_model, fake_anno) + # for multi-label recognition + score = output.pred_score.tolist() + for k in range(len(score)): # 81 + if k not in label_map: + continue + if score[k] > args.action_score_thr: + skeleton_prediction[i].append((label_map[k], score[k])) + + skeleton_predictions.append(skeleton_prediction) + prog_bar.update() + + return timestamps, skeleton_predictions + + +def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w, + new_h, w_ratio, h_ratio): + + rgb_stdet_config = mmengine.Config.fromfile(args.rgb_stdet_config) + rgb_stdet_config.merge_from_dict(args.cfg_options) + + val_pipeline = rgb_stdet_config.val_pipeline + sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + + window_size = clip_len * frame_interval + num_frame = len(frames) + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Get img_norm_cfg + img_norm_cfg = dict( + mean=np.array(rgb_stdet_config.model.data_preprocessor.mean), + std=np.array(rgb_stdet_config.model.data_preprocessor.std), + to_rgb=False) + + # Build STDET model + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. 
+ rgb_stdet_config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + + rgb_stdet_config.model.backbone.pretrained = None + rgb_stdet_model = init_detector( + rgb_stdet_config, args.rgb_stdet_checkpoint, device=args.device) + + predictions = [] + + print('Performing SpatioTemporal Action Detection for each clip') + prog_bar = mmengine.ProgressBar(len(timestamps)) + # for timestamp, proposal in zip(timestamps, human_detections): + for timestamp in timestamps: + proposal = human_detections[timestamp - 1] + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(args.device) + + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=proposal) + datasample.set_metainfo(dict(img_shape=(new_h, new_w))) + with torch.no_grad(): + result = rgb_stdet_model( + input_tensor, [datasample], mode='predict') + scores = result[0].pred_instances.scores + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, + i].item())) + predictions.append(prediction) + prog_bar.update() + + return timestamps, predictions + + +def main(): + args = parse_args() + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) + num_frame = len(frame_paths) + h, w, _ = 
original_frames[0].shape + + # Get Human detection results and pose results + human_detections, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device) + pose_datasample = None + if args.use_skeleton_recog or args.use_skeleton_stdet: + pose_results, pose_datasample = pose_inference( + args.pose_config, + args.pose_checkpoint, + frame_paths, + human_detections, + device=args.device) + + # resize frames to shortside 256 + new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Load spatio-temporal detection label_map + stdet_label_map = load_label_map(args.label_map_stdet) + rgb_stdet_config = mmengine.Config.fromfile(args.rgb_stdet_config) + rgb_stdet_config.merge_from_dict(args.cfg_options) + try: + if rgb_stdet_config['data']['train']['custom_classes'] is not None: + stdet_label_map = { + id + 1: stdet_label_map[cls] + for id, cls in enumerate(rgb_stdet_config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + action_result = None + if args.use_skeleton_recog: + print('Use skeleton-based recognition') + action_result = skeleton_based_action_recognition( + args, pose_results, h, w) + else: + print('Use rgb-based recognition') + action_result = rgb_based_action_recognition(args) + + stdet_preds = None + if args.use_skeleton_stdet: + print('Use skeleton-based SpatioTemporal Action Detection') + clip_len, frame_interval = 30, 1 + timestamps, stdet_preds = skeleton_based_stdet(args, stdet_label_map, + human_detections, + pose_results, num_frame, + clip_len, + frame_interval, h, w) + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + else: + print('Use rgb-based SpatioTemporal Action Detection') + for i in 
range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + timestamps, stdet_preds = rgb_based_stdet(args, frames, + stdet_label_map, + human_detections, w, h, + new_w, new_h, w_ratio, + h_ratio) + + stdet_results = [] + for timestamp, prediction in zip(timestamps, stdet_preds): + human_detection = human_detections[timestamp - 1] + stdet_results.append( + pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int64) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + output_timestamps = dense_timestamps(timestamps, dense_n) + frames = [ + cv2.imread(frame_paths[timestamp - 1]) + for timestamp in output_timestamps + ] + + if args.use_skeleton_recog or args.use_skeleton_stdet: + pose_datasample = [ + pose_datasample[timestamp - 1] for timestamp in output_timestamps + ] + + vis_frames = visualize(args, frames, stdet_results, pose_datasample, + action_result) + vid = mpy.ImageSequenceClip(vis_frames, fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_dir.cleanup() + + +if __name__ == '__main__': + main() diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a2afe3f28395b07414776e93fe11ff3b864966ea --- /dev/null +++ b/demo/long_video_demo.py @@ -0,0 +1,270 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +import argparse +import json +import random +from collections import deque +from operator import itemgetter + +import cv2 +import mmengine +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.dataset import Compose + +from mmaction.apis import inference_recognizer, init_recognizer + +FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL +FONTSCALE = 1 +THICKNESS = 1 +LINETYPE = 1 + +EXCLUED_STEPS = [ + 'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit', + 'PyAVDecode', 'RawFrameDecode' +] + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 predict different labels in a long video demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video_path', help='video file/url') + parser.add_argument('label', help='label file') + parser.add_argument('out_file', help='output result file in video/json') + parser.add_argument( + '--input-step', + type=int, + default=1, + help='input step for sampling frames') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--threshold', + type=float, + default=0.01, + help='recognition score threshold') + parser.add_argument( + '--stride', + type=float, + default=0, + help=('the prediction stride equals to stride * sample_length ' + '(sample_length indicates the size of temporal window from ' + 'which you sample frames, which equals to ' + 'clip_len x frame_interval), if set as 0, the ' + 'prediction stride is 1')) + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--label-color', + nargs='+', + type=int, + default=(255, 255, 255), + help='font color (B, G, R) of the labels in output video') + parser.add_argument( + '--msg-color', + nargs='+', + type=int, + default=(128, 128, 128), + help='font color (B, G, R) of the messages in output video') + args = parser.parse_args() + return args + + +def show_results_video(result_queue, + text_info, + thr, + msg, + frame, + video_writer, + label_color=(255, 255, 255), + msg_color=(128, 128, 128)): + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < thr: + break + location = (0, 40 + i * 20) + text = selected_label + ': ' + str(round(score, 2)) + text_info[location] = text + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + label_color, THICKNESS, LINETYPE) + elif len(text_info): + for location, text in text_info.items(): + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + label_color, THICKNESS, LINETYPE) + else: + cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, msg_color, + THICKNESS, LINETYPE) + video_writer.write(frame) + return text_info + + +def get_results_json(result_queue, text_info, thr, msg, ind, out_json): + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < thr: + break + text_info[i + 1] = selected_label + ': ' + str(round(score, 2)) + out_json[ind] = text_info + elif len(text_info): + out_json[ind] = text_info + else: + out_json[ind] = msg + return text_info, out_json + + +def show_results(model, data, label, args): + frame_queue = deque(maxlen=args.sample_length) + result_queue = deque(maxlen=1) + + cap = cv2.VideoCapture(args.video_path) + num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frame_width = 
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) + + msg = 'Preparing action recognition ...' + text_info = {} + out_json = {} + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + frame_size = (frame_width, frame_height) + + ind = 0 + video_writer = None if args.out_file.endswith('.json') \ + else cv2.VideoWriter(args.out_file, fourcc, fps, frame_size) + prog_bar = mmengine.ProgressBar(num_frames) + backup_frames = [] + + while ind < num_frames: + ind += 1 + prog_bar.update() + ret, frame = cap.read() + if frame is None: + # drop it when encountering None + continue + backup_frames.append(np.array(frame)[:, :, ::-1]) + if ind == args.sample_length: + # provide a quick show at the beginning + frame_queue.extend(backup_frames) + backup_frames = [] + elif ((len(backup_frames) == args.input_step + and ind > args.sample_length) or ind == num_frames): + # pick a frame from the backup + # when the backup is full or we reach the last frame + chosen_frame = random.choice(backup_frames) + backup_frames = [] + frame_queue.append(chosen_frame) + + ret, scores = inference(model, data, args, frame_queue) + + if ret: + num_selected_labels = min(len(label), 5) + scores_tuples = tuple(zip(label, scores)) + scores_sorted = sorted( + scores_tuples, key=itemgetter(1), reverse=True) + results = scores_sorted[:num_selected_labels] + result_queue.append(results) + + if args.out_file.endswith('.json'): + text_info, out_json = get_results_json(result_queue, text_info, + args.threshold, msg, ind, + out_json) + else: + text_info = show_results_video(result_queue, text_info, + args.threshold, msg, frame, + video_writer, args.label_color, + args.msg_color) + + cap.release() + if video_writer: + video_writer.release() + cv2.destroyAllWindows() + if args.out_file.endswith('.json'): + with open(args.out_file, 'w') as js: + json.dump(out_json, js) + + +def inference(model, data, args, frame_queue): + if len(frame_queue) != 
args.sample_length: + # Do not run inference when there are not enough frames + return False, None + + cur_windows = list(np.array(frame_queue)) + if data['img_shape'] is None: + data['img_shape'] = frame_queue[0].shape[:2] + + cur_data = data.copy() + cur_data.update( + dict( + array=cur_windows, + modality='RGB', + frame_inds=np.arange(args.sample_length))) + + result = inference_recognizer( + model, cur_data, test_pipeline=args.test_pipeline) + scores = result.pred_score.tolist() + + if args.stride > 0: + pred_stride = int(args.sample_length * args.stride) + for _ in range(pred_stride): + frame_queue.popleft() + + # for case ``args.stride=0`` + # deque will automatically popleft one element + + return True, scores + + +def main(): + args = parse_args() + + args.device = torch.device(args.device) + + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + + model = init_recognizer(cfg, args.checkpoint, device=args.device) + data = dict(img_shape=None, modality='RGB', label=-1) + with open(args.label, 'r') as f: + label = [line.strip() for line in f] + + # prepare test pipeline from non-camera pipeline + cfg = model.cfg + sample_length = 0 + pipeline = cfg.test_pipeline + pipeline_ = pipeline.copy() + for step in pipeline: + if 'SampleFrames' in step['type']: + sample_length = step['clip_len'] * step['num_clips'] + data['num_clips'] = step['num_clips'] + data['clip_len'] = step['clip_len'] + pipeline_.remove(step) + if step['type'] in EXCLUED_STEPS: + # remove step to decode frames + pipeline_.remove(step) + pipeline_.insert(1, dict(type='ArrayDecode')) + test_pipeline = Compose(pipeline_) + + assert sample_length > 0 + args.sample_length = sample_length + args.test_pipeline = test_pipeline + + show_results(model, data, label, args) + + +if __name__ == '__main__': + main() diff --git a/demo/mmaction2_tutorial.ipynb b/demo/mmaction2_tutorial.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..c5ca4e478c6553fe1ae8578524113c982a21d046 --- /dev/null +++ b/demo/mmaction2_tutorial.ipynb @@ -0,0 +1,1936 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "VcjSRFELVbNk" + }, + "source": [ + "# MMAction2 Tutorial\n", + "\n", + "Welcome to MMAction2! This is the official colab tutorial for using MMAction2. In this tutorial, you will learn\n", + "- Perform inference with a MMAction2 recognizer.\n", + "- Train a new recognizer with a new dataset.\n", + "\n", + "\n", + "Let's start!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7LqHGkGEVqpm" + }, + "source": [ + "## Install MMAction2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Bf8PpPXtVvmg", + "outputId": "9d3f4594-f151-4ee9-a19b-09f8a439ac04" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "nvcc: NVIDIA (R) Cuda compiler driver\n", + "Copyright (c) 2005-2022 NVIDIA Corporation\n", + "Built on Wed_Sep_21_10:33:58_PDT_2022\n", + "Cuda compilation tools, release 11.8, V11.8.89\n", + "Build cuda_11.8.r11.8/compiler.31833905_0\n", + "gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + "Copyright (C) 2019 Free Software Foundation, Inc.\n", + "This is free software; see the source for copying conditions. 
There is NO\n", + "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n", + "\n" + ] + } + ], + "source": [ + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "ZPwKGzqydnb2", + "outputId": "27506fa7-48a2-4fe0-d377-56f940dafec4", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://download.pytorch.org/whl/cu118, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.0+cu118)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.15.1+cu118)\n", + "Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) (1.22.4)\n", + 
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.27.1)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (8.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.4)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n" + ] + } + ], + "source": [ + "# install dependencies: (if your colab has CUDA 11.8)\n", + "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5PAJ4ArzV5Ry", + "outputId": "eb8539a0-9524-4c48-f3e1-0b013ce0d344" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m51.3/51.3 kB\u001B[0m \u001B[31m4.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from 
openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already 
satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + "Installing collected packages: ordered-set, colorama, model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.3-py3-none-any.whl (372 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m372.1/372.1 kB\u001B[0m \u001B[31m20.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting 
yapf (from mmengine)\n", + " Downloading yapf-0.33.0-py2.py3-none-any.whl (200 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m200.9/200.9 kB\u001B[0m \u001B[31m21.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from 
markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, yapf, mmengine\n", + "Successfully installed addict-2.4.0 mmengine-0.7.3 yapf-0.33.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv>=2.0.0\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m74.4/74.4 MB\u001B[0m \u001B[31m9.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (0.7.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (0.33.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv>=2.0.0) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (3.7.1)\n", + "Requirement already satisfied: rich in 
/usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv>=2.0.0) (2.3.0)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv>=2.0.0) (2.0.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv>=2.0.0) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv>=2.0.0) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv>=2.0.0) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv>=2.0.0) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully 
installed mmcv-2.0.0\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 21284, done.\u001B[K\n", + "remote: Counting objects: 100% (394/394), done.\u001B[K\n", + "remote: Compressing objects: 100% (287/287), done.\u001B[K\n", + "remote: Total 21284 (delta 175), reused 248 (delta 103), pack-reused 20890\u001B[K\n", + "Receiving objects: 100% (21284/21284), 68.63 MiB | 16.59 MiB/s, done.\n", + "Resolving deltas: 100% (14990/14990), done.\n", + "/content/mmaction2\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m13.6/13.6 MB\u001B[0m \u001B[31m76.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m42.2/42.2 kB\u001B[0m \u001B[31m4.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in 
/usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.0+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.3)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from 
matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting av>=9.0 (from -r requirements/optional.txt (line 1))\n", + " Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m31.0/31.0 MB\u001B[0m \u001B[31m38.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: future in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 2)) (0.18.3)\n", + "Collecting fvcore (from -r requirements/optional.txt (line 3))\n", + " Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m50.2/50.2 kB\u001B[0m \u001B[31m6.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: imgaug in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 4)) (0.4.0)\n", + "Requirement already satisfied: librosa in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 5)) (0.10.0.post2)\n", + "Collecting lmdb (from -r requirements/optional.txt (line 6))\n", + " Downloading lmdb-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m299.2/299.2 kB\u001B[0m \u001B[31m30.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: moviepy in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 7)) (1.0.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 8)) (23.1)\n", + "Collecting pims (from -r requirements/optional.txt (line 9))\n", + " Downloading PIMS-0.6.1.tar.gz (86 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m86.0/86.0 kB\u001B[0m \u001B[31m12.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Collecting PyTurboJPEG (from -r requirements/optional.txt (line 10))\n", + " Downloading PyTurboJPEG-1.7.1.tar.gz (11 kB)\n", + " Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: soundfile in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 11)) (0.12.1)\n", + "Requirement already satisfied: tensorboard in /usr/local/lib/python3.10/dist-packages (from -r requirements/optional.txt (line 12)) (2.12.2)\n", + "Collecting wandb (from -r requirements/optional.txt (line 13))\n", + " Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m2.0/2.0 MB\u001B[0m \u001B[31m79.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (1.22.4)\n", + "Collecting yacs>=0.1.6 (from fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading yacs-0.1.8-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (6.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (4.65.0)\n", + "Requirement already satisfied: termcolor>=1.1 in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (2.3.0)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (8.4.0)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from fvcore->-r requirements/optional.txt (line 3)) (0.8.10)\n", + "Collecting iopath>=0.1.7 (from fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading iopath-0.1.10.tar.gz (42 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m42.2/42.2 kB\u001B[0m \u001B[31m4.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h 
Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (1.16.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (1.10.1)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (3.7.1)\n", + "Requirement already satisfied: scikit-image>=0.14.2 in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (0.19.3)\n", + "Requirement already satisfied: opencv-python in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (4.7.0.72)\n", + "Requirement already satisfied: imageio in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (2.25.1)\n", + "Requirement already satisfied: Shapely in /usr/local/lib/python3.10/dist-packages (from imgaug->-r requirements/optional.txt (line 4)) (2.0.1)\n", + "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (3.0.0)\n", + "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.2.2)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.2.0)\n", + "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (4.4.2)\n", + "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.56.4)\n", + "Requirement already satisfied: pooch<1.7,>=1.0 in 
/usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.6.0)\n", + "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.3.5)\n", + "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (4.5.0)\n", + "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (0.2)\n", + "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements/optional.txt (line 5)) (1.0.5)\n", + "Requirement already satisfied: requests<3.0,>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (2.27.1)\n", + "Requirement already satisfied: proglog<=1.0.0 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (0.1.10)\n", + "Requirement already satisfied: imageio-ffmpeg>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from moviepy->-r requirements/optional.txt (line 7)) (0.4.8)\n", + "Collecting slicerator>=0.9.8 (from pims->-r requirements/optional.txt (line 9))\n", + " Downloading slicerator-1.1.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile->-r requirements/optional.txt (line 11)) (1.15.1)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.4.0)\n", + "Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.54.0)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r 
requirements/optional.txt (line 12)) (2.17.3)\n", + "Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.0.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (3.4.3)\n", + "Requirement already satisfied: protobuf>=3.19.6 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (3.20.3)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (67.7.2)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (0.7.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (1.8.1)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (2.3.0)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.10/dist-packages (from tensorboard->-r requirements/optional.txt (line 12)) (0.40.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (8.1.3)\n", + "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m184.3/184.3 kB\u001B[0m \u001B[31m22.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: psutil>=5.0.0 in 
/usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (5.9.5)\n", + "Collecting sentry-sdk>=1.0.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading sentry_sdk-1.22.2-py2.py3-none-any.whl (203 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m203.3/203.3 kB\u001B[0m \u001B[31m25.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hCollecting docker-pycreds>=0.4.0 (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n", + "Collecting pathtools (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading pathtools-0.1.2.tar.gz (11 kB)\n", + " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "Collecting setproctitle (from wandb->-r requirements/optional.txt (line 13))\n", + " Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n", + "Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements/optional.txt (line 13)) (1.4.4)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile->-r requirements/optional.txt (line 11)) (2.21)\n", + "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb->-r requirements/optional.txt (line 13))\n", + " Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m62.7/62.7 kB\u001B[0m \u001B[31m9.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25hRequirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (5.3.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in 
/usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements/optional.txt (line 12)) (1.3.1)\n", + "Collecting portalocker (from iopath>=0.1.7->fvcore->-r requirements/optional.txt (line 3))\n", + " Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa->-r requirements/optional.txt (line 5)) (0.39.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.8.1->moviepy->-r requirements/optional.txt (line 7)) (3.4)\n", + "Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (3.1)\n", + "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (2023.4.12)\n", + "Requirement 
already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.14.2->imgaug->-r requirements/optional.txt (line 4)) (1.4.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa->-r requirements/optional.txt (line 5)) (3.1.0)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard->-r requirements/optional.txt (line 12)) (2.1.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->imgaug->-r requirements/optional.txt (line 4)) (2.8.2)\n", + "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->-r requirements/optional.txt (line 13))\n", + " Downloading smmap-5.0.0-py3-none-any.whl (24 kB)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements/optional.txt (line 12)) (0.5.0)\n", + "Requirement already 
satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->-r requirements/optional.txt (line 12)) (3.2.2)\n", + "Building wheels for collected packages: fvcore, pims, PyTurboJPEG, iopath, pathtools\n", + " Building wheel for fvcore (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for fvcore: filename=fvcore-0.1.5.post20221221-py3-none-any.whl size=61405 sha256=25c1e50155c8788d00eec898793c96133a746a8bb076ffc5c01f5a4dc256751e\n", + " Stored in directory: /root/.cache/pip/wheels/01/c0/af/77c1cf53a1be9e42a52b48e5af2169d40ec2e89f7362489dd0\n", + " Building wheel for pims (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for pims: filename=PIMS-0.6.1-py3-none-any.whl size=82619 sha256=59a328dc88a438c60cfb6e937e04c8a7dd55ad2a2905034cd41ff80cdbba6497\n", + " Stored in directory: /root/.cache/pip/wheels/cc/bf/3e/bfa77232d942f8244145f9c713b6b38f6ef04b6fb5c021c114\n", + " Building wheel for PyTurboJPEG (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for PyTurboJPEG: filename=PyTurboJPEG-1.7.1-py3-none-any.whl size=12243 sha256=ddf6424c85ac533335abd96dd9e98b014ea1dd4f143c88cd35ecb08d6128f411\n", + " Stored in directory: /root/.cache/pip/wheels/de/6e/b1/e7ba70c328c3395555cb92ca8820babb32950d867858b1948b\n", + " Building wheel for iopath (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31531 sha256=db977a4344bebbdd710665e767caab4fbcf53cc6aea0707cd38d26c45718331e\n", + " Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d\n", + " Building wheel for pathtools (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + " Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8791 sha256=08bb5753ce029aef01f25c3e81882d93c0e040e5932e90a02a062ad058756b52\n", + " Stored in directory: /root/.cache/pip/wheels/e7/f3/22/152153d6eb222ee7a56ff8617d80ee5207207a8c00a7aab794\n", + "Successfully built fvcore pims PyTurboJPEG iopath pathtools\n", + "Installing collected packages: slicerator, pathtools, lmdb, av, yacs, smmap, setproctitle, sentry-sdk, PyTurboJPEG, portalocker, docker-pycreds, pims, iopath, gitdb, GitPython, fvcore, wandb\n", + "Successfully installed GitPython-3.1.31 PyTurboJPEG-1.7.1 av-10.0.0 docker-pycreds-0.4.0 fvcore-0.1.5.post20221221 gitdb-4.0.10 iopath-0.1.10 lmdb-1.4.1 pathtools-0.1.2 pims-0.6.1 portalocker-2.7.0 sentry-sdk-1.22.2 setproctitle-1.3.2 slicerator-1.1.0 smmap-5.0.0 wandb-0.15.2 yacs-0.1.8\n" + ] + } + ], + "source": [ + "# install MMEngine, MMCV and MMDetection using MIM\n", + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install \"mmcv>=2.0.0\"\n", + "\n", + "# Install mmaction2\n", + "!rm -rf mmaction2\n", + "!git clone https://github.com/open-mmlab/mmaction2.git -b main\n", + "%cd mmaction2\n", + "\n", + "!pip install -e .\n", + "\n", + "# Install some optional requirements\n", + "!pip install -r requirements/optional.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "No_zZAFpWC-a", + "outputId": "9386dd81-2308-4adb-d3cb-798de11c035e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2.0.0+cu118 True\n", + "1.0.0\n", + "11.8\n", + "GCC 9.3\n", + "OrderedDict([('sys.platform', 'linux'), ('Python', '3.10.11 (main, Apr 5 2023, 14:15:10) [GCC 9.4.0]'), ('CUDA available', True), ('numpy_random_seed', 2147483648), ('GPU 0', 'Tesla T4'), ('CUDA_HOME', '/usr/local/cuda'), ('NVCC', 'Cuda compilation tools, release 11.8, V11.8.89'), ('GCC', 'x86_64-linux-gnu-gcc 
(Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0'), ('PyTorch', '2.0.0+cu118'), ('PyTorch compiling details', 'PyTorch built with:\\n - GCC 9.3\\n - C++ Version: 201703\\n - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\\n - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\\n - LAPACK is enabled (usually provided by MKL)\\n - NNPACK is enabled\\n - CPU capability usage: AVX2\\n - CUDA Runtime 11.8\\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\\n - CuDNN 8.7\\n - Magma 2.6.1\\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, 
PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \\n'), ('TorchVision', '0.15.1+cu118'), ('OpenCV', '4.7.0'), ('MMEngine', '0.7.3')])\n" + ] + } + ], + "source": [ + "# Check Pytorch installation\n", + "import torch, torchvision\n", + "print(torch.__version__, torch.cuda.is_available())\n", + "\n", + "# Check MMAction2 installation\n", + "import mmaction\n", + "print(mmaction.__version__)\n", + "\n", + "# Check MMCV installation\n", + "from mmcv.ops import get_compiling_cuda_version, get_compiler_version\n", + "print(get_compiling_cuda_version())\n", + "print(get_compiler_version())\n", + "\n", + "# Check MMEngine installation\n", + "from mmengine.utils.dl_utils import collect_env\n", + "print(collect_env())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pXf7oV5DWdab" + }, + "source": [ + "## Perform inference with a MMAction2 recognizer\n", + "MMAction2 already provides high level APIs to do inference and training." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "64CW6d_AaT-Q", + "outputId": "ea330d8c-2e20-4dbd-d046-51d7c9ec4f7a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-05-15 03:33:08-- https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.216, 163.181.82.218, 163.181.82.213, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.216|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 97579339 (93M) [application/octet-stream]\n", + "Saving to: ‘checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth’\n", + "\n", + "checkpoints/tsn_r50 100%[===================>] 93.06M 26.1MB/s in 3.6s \n", + "\n", + "2023-05-15 03:33:12 (26.2 MB/s) - ‘checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth’ saved [97579339/97579339]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir checkpoints\n", + "!wget -c https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \\\n", + " -O checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HNZB7NoSabzj", + "outputId": "c0c2ba71-72ff-4cac-a5b8-65590f5a6bb0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loads checkpoint by local backend from path: checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n" + ] + } + ], + "source": [ + "from mmaction.apis import inference_recognizer, init_recognizer\n", + "from mmengine import Config\n", + "\n", + "\n", + "# Choose to use a config and initialize the recognizer\n", + "config = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'\n", + "config = Config.fromfile(config)\n", + "# Setup a checkpoint file to load\n", + "checkpoint = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "# Initialize the recognizer\n", + "model = init_recognizer(config, checkpoint, device='cuda:0')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "rEMsBnpHapAn", + "outputId": "ec05049e-7289-4798-94fa-2b773cb23634", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 
03:33:18 - mmengine - WARNING - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "05/15 03:33:18 - mmengine - WARNING - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n" + ] + } + ], + "source": [ + "# Use the recognizer to do inference\n", + "from operator import itemgetter\n", + "video = 'demo/demo.mp4'\n", + "label = 'tools/data/kinetics/label_map_k400.txt'\n", + "results = inference_recognizer(model, video)\n", + "\n", + "pred_scores = results.pred_score.tolist()\n", + "score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))\n", + "score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)\n", + "top5_label = score_sorted[:5]\n", + "\n", + "labels = open(label).readlines()\n", + "labels = [x.strip() for x in labels]\n", + "results = [(labels[k[0]], k[1]) for k in top5_label]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NIyJXqfWathq", + "outputId": "cb25aca9-e72d-4c54-f295-4c889713cb3a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The top-5 labels with corresponding scores are:\n", + "arm wrestling: 1.0\n", + "rock scissors paper: 6.434453414527752e-09\n", + "shaking hands: 2.7599860175087088e-09\n", + "clapping: 1.3454612979302283e-09\n", + "massaging feet: 5.555100823784187e-10\n" + ] + } + ], + "source": [ + "print('The top-5 labels with corresponding scores are:')\n", + "for result in results:\n", + " print(f'{result[0]}: ', result[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QuZG8kZ2fJ5d" + }, + "source": [ + "## Train a recognizer on customized dataset\n", + "\n", + "To train a new recognizer, there are usually three things to do:\n", + "1. Support a new dataset\n", + "2. Modify the config\n", + "3. 
Train a new recognizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "neEFyxChfgiJ" + }, + "source": [ + "### Support a new dataset\n", + "\n", + "In this tutorial, we gives an example to convert the data into the format of existing datasets. Other methods and more advanced usages can be found in the [doc](/docs/tutorials/new_dataset.md)\n", + "\n", + "Firstly, let's download a tiny dataset obtained from [Kinetics-400](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). We select 30 videos with their labels as train dataset and 10 videos with their labels as test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gjsUj9JzgUlJ", + "outputId": "96a0e6e9-0dd8-4c07-9fed-22b93d5c1318" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "rm: cannot remove 'kinetics400_tiny.zip*': No such file or directory\n", + "--2023-05-15 03:33:27-- https://download.openmmlab.com/mmaction/kinetics400_tiny.zip\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.216, 163.181.82.218, 163.181.82.213, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.216|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 18308682 (17M) [application/zip]\n", + "Saving to: ‘kinetics400_tiny.zip’\n", + "\n", + "kinetics400_tiny.zi 100%[===================>] 17.46M 32.7MB/s in 0.5s \n", + "\n", + "2023-05-15 03:33:28 (32.7 MB/s) - ‘kinetics400_tiny.zip’ saved [18308682/18308682]\n", + "\n" + ] + } + ], + "source": [ + "# download, decompress the data\n", + "!rm kinetics400_tiny.zip*\n", + "!rm -rf kinetics400_tiny\n", + "!wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip\n", + "!unzip kinetics400_tiny.zip > /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AbZ-o7V6hNw4", + "outputId": "f229f352-1b43-41b7-a374-21404f618581" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 24 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 1s (48.9 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 
122519 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001B[01;34mkinetics400_tiny\u001B[00m\n", + "├── kinetics_tiny_train_video.txt\n", + "├── kinetics_tiny_val_video.txt\n", + "├── \u001B[01;34mtrain\u001B[00m\n", + "│   ├── 27_CSXByd3s.mp4\n", + "│   ├── 34XczvTaRiI.mp4\n", + "│   ├── A-wiliK50Zw.mp4\n", + "│   ├── D32_1gwq35E.mp4\n", + "│   ├── D92m0HsHjcQ.mp4\n", + "│   ├── DbX8mPslRXg.mp4\n", + "│   ├── FMlSTTpN3VY.mp4\n", + "│   ├── h10B9SVE-nk.mp4\n", + "│   ├── h2YqqUhnR34.mp4\n", + "│   ├── iRuyZSKhHRg.mp4\n", + "│   ├── IyfILH9lBRo.mp4\n", + "│   ├── kFC3KY2bOP8.mp4\n", + "│   ├── LvcFDgCAXQs.mp4\n", + "│   ├── O46YA8tI530.mp4\n", + "│   ├── oMrZaozOvdQ.mp4\n", + "│   ├── oXy-e_P_cAI.mp4\n", + "│   ├── P5M-hAts7MQ.mp4\n", + "│   ├── phDqGd0NKoo.mp4\n", + "│   ├── PnOe3GZRVX8.mp4\n", + "│   ├── R8HXQkdgKWA.mp4\n", + "│   ├── RqnKtCEoEcA.mp4\n", + "│   ├── soEcZZsBmDs.mp4\n", + "│   ├── TkkZPZHbAKA.mp4\n", + "│   ├── T_TMNGzVrDk.mp4\n", + "│   ├── WaS0qwP46Us.mp4\n", + "│   ├── Wh_YPQdH1Zg.mp4\n", + "│   ├── WWP5HZJsg-o.mp4\n", + "│   ├── xGY2dP0YUjA.mp4\n", + "│   ├── yLC9CtWU5ws.mp4\n", + "│   └── ZQV4U2KQ370.mp4\n", + "└── \u001B[01;34mval\u001B[00m\n", + " ├── 0pVGiAU6XEA.mp4\n", + " ├── AQrbRSnRt8M.mp4\n", + " ├── b6Q_b7vgc7Q.mp4\n", + " ├── ddvJ6-faICE.mp4\n", + " ├── IcLztCtvhb8.mp4\n", + " ├── ik4BW3-SCts.mp4\n", + " ├── jqRrH30V0k4.mp4\n", + " ├── SU_x2LQqSLs.mp4\n", + " ├── u4Rm6srmIS8.mp4\n", + " └── y5Iu7XkTqV0.mp4\n", + "\n", + "2 directories, 42 files\n" + ] + } + ], + "source": [ + "# Check the directory structure of the tiny data\n", + "\n", + "# Install tree first\n", + "!apt-get -q install tree\n", + "!tree kinetics400_tiny" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" 
+ }, + "id": "fTdi6dI0hY3g", + "outputId": "95f22438-566c-4496-fe0c-50e128b47b5e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "D32_1gwq35E.mp4 0\n", + "iRuyZSKhHRg.mp4 1\n", + "oXy-e_P_cAI.mp4 0\n", + "34XczvTaRiI.mp4 1\n", + "h2YqqUhnR34.mp4 0\n", + "O46YA8tI530.mp4 0\n", + "kFC3KY2bOP8.mp4 1\n", + "WWP5HZJsg-o.mp4 1\n", + "phDqGd0NKoo.mp4 1\n", + "yLC9CtWU5ws.mp4 0\n", + "27_CSXByd3s.mp4 1\n", + "IyfILH9lBRo.mp4 1\n", + "T_TMNGzVrDk.mp4 1\n", + "TkkZPZHbAKA.mp4 0\n", + "PnOe3GZRVX8.mp4 1\n", + "soEcZZsBmDs.mp4 1\n", + "FMlSTTpN3VY.mp4 1\n", + "WaS0qwP46Us.mp4 0\n", + "A-wiliK50Zw.mp4 1\n", + "oMrZaozOvdQ.mp4 1\n", + "ZQV4U2KQ370.mp4 0\n", + "DbX8mPslRXg.mp4 1\n", + "h10B9SVE-nk.mp4 1\n", + "P5M-hAts7MQ.mp4 0\n", + "R8HXQkdgKWA.mp4 0\n", + "D92m0HsHjcQ.mp4 0\n", + "RqnKtCEoEcA.mp4 0\n", + "LvcFDgCAXQs.mp4 0\n", + "xGY2dP0YUjA.mp4 0\n", + "Wh_YPQdH1Zg.mp4 0\n" + ] + } + ], + "source": [ + "# After downloading the data, we need to check the annotation format\n", + "!cat kinetics400_tiny/kinetics_tiny_train_video.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0bq0mxmEi29H" + }, + "source": [ + "According to the format defined in [`VideoDataset`](./datasets/video_dataset.py), each line indicates a sample video with the filepath and label, which are split with a whitespace." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ht_DGJA9jQar" + }, + "source": [ + "### Modify the config\n", + "\n", + "In the next step, we need to modify the config for the training.\n", + "To accelerate the process, we finetune a recognizer using a pre-trained recognizer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "LjCcmCKOjktc" + }, + "outputs": [], + "source": [ + "cfg = Config.fromfile('./configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tc8YhFFGjp3e" + }, + "source": [ + "Given a config that trains a TSN model on kinetics400-full dataset, we need to modify some values to use it for training TSN on Kinetics400-tiny dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tlhu9byjjt-K", + "outputId": "2d984a1d-93f7-493f-fd77-e19af8285f38" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Config:\n", + "model = dict(\n", + " type='Recognizer2D',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',\n", + " depth=50,\n", + " norm_eval=False),\n", + " cls_head=dict(\n", + " type='TSNHead',\n", + " num_classes=2,\n", + " in_channels=2048,\n", + " spatial_type='avg',\n", + " consensus=dict(type='AvgConsensus', dim=1),\n", + " dropout_ratio=0.4,\n", + " init_std=0.01,\n", + " average_clips='prob'),\n", + " data_preprocessor=dict(\n", + " type='ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCHW'),\n", + " train_cfg=None,\n", + " test_cfg=None)\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=100,\n", + " by_epoch=True,\n", + " milestones=[40, 80],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD', lr=7.8125e-05, momentum=0.9, 
weight_decay=0.0001),\n", + " clip_grad=dict(max_norm=40, norm_type=2))\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook'),\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=20, ignore_last=False),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook', interval=3, save_best='auto', max_keep_ckpts=3),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " sync_buffers=dict(type='SyncBuffersHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')])\n", + "log_level = 'INFO'\n", + "load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "resume = False\n", + "dataset_type = 'VideoDataset'\n", + "data_root = 'kinetics400_tiny/train/'\n", + "data_root_val = 'kinetics400_tiny/val/'\n", + "ann_file_train = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " 
dict(type='PackActionInputs')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_train_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/train/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames', clip_len=1, frame_interval=1,\n", + " num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ]))\n", + "val_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " 
persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "val_evaluator = dict(type='AccMetric')\n", + "test_evaluator = dict(type='AccMetric')\n", + "auto_scale_lr = dict(enable=False, base_batch_size=256)\n", + "work_dir = './tutorial_exps'\n", + "\n" + ] + } + ], + "source": [ + "from mmengine.runner import set_random_seed\n", + "\n", + "# Modify dataset type and path\n", + "cfg.data_root = 'kinetics400_tiny/train/'\n", + "cfg.data_root_val = 'kinetics400_tiny/val/'\n", + "cfg.ann_file_train = 
'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "cfg.ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "\n", + "\n", + "cfg.test_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "cfg.test_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/val/'\n", + "\n", + "cfg.train_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "cfg.train_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/train/'\n", + "\n", + "cfg.val_dataloader.dataset.ann_file = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "cfg.val_dataloader.dataset.data_prefix.video = 'kinetics400_tiny/val/'\n", + "\n", + "\n", + "# Modify num classes of the model in cls_head\n", + "cfg.model.cls_head.num_classes = 2\n", + "# We can use the pre-trained TSN model\n", + "cfg.load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "\n", + "# Set up working dir to save files and logs.\n", + "cfg.work_dir = './tutorial_exps'\n", + "\n", + "# The original learning rate (LR) is set for 8-GPU training.\n", + "# We divide it by 8 since we only use one GPU, and divide it\n", + "# by 16 again since we reduce the batch size by a factor of 16.\n", + "cfg.train_dataloader.batch_size = cfg.train_dataloader.batch_size // 16\n", + "cfg.val_dataloader.batch_size = cfg.val_dataloader.batch_size // 16\n", + "cfg.optim_wrapper.optimizer.lr = cfg.optim_wrapper.optimizer.lr / 8 / 16\n", + "cfg.train_cfg.max_epochs = 10\n", + "\n", + "cfg.train_dataloader.num_workers = 2\n", + "cfg.val_dataloader.num_workers = 2\n", + "cfg.test_dataloader.num_workers = 2\n", + "\n", + "# We can initialize the logger for training and have a look\n", + "# at the final config used for training\n", + "print(f'Config:\\n{cfg.pretty_text}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tES-qnZ3k38Z" + }, + "source": [ + "### Train a new recognizer\n", + "\n", + "Finally, let's initialize the dataset and recognizer, then train a new recognizer!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dDBWkdDRk6oz", + "outputId": "044b9e09-2038-41c9-d5a3-8a74ae11ade2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:33:34 - mmengine - INFO - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.11 (main, Apr 5 2023, 14:15:10) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1853452922\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.0+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.1+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.3\n", + "\n", + 
"Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: None\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "05/15 03:33:34 - mmengine - INFO - Config:\n", + "model = dict(\n", + " type='Recognizer2D',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',\n", + " depth=50,\n", + " norm_eval=False),\n", + " cls_head=dict(\n", + " type='TSNHead',\n", + " num_classes=2,\n", + " in_channels=2048,\n", + " spatial_type='avg',\n", + " consensus=dict(type='AvgConsensus', dim=1),\n", + " dropout_ratio=0.4,\n", + " init_std=0.01,\n", + " average_clips='prob'),\n", + " data_preprocessor=dict(\n", + " type='ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCHW'),\n", + " train_cfg=None,\n", + " test_cfg=None)\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=100,\n", + " by_epoch=True,\n", + " milestones=[40, 80],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD', lr=7.8125e-05, momentum=0.9, weight_decay=0.0001),\n", + " clip_grad=dict(max_norm=40, norm_type=2))\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook'),\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=20, ignore_last=False),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook', interval=3, 
save_best='auto', max_keep_ckpts=3),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " sync_buffers=dict(type='SyncBuffersHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')])\n", + "log_level = 'INFO'\n", + "load_from = './checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'\n", + "resume = False\n", + "dataset_type = 'VideoDataset'\n", + "data_root = 'kinetics400_tiny/train/'\n", + "data_root_val = 'kinetics400_tiny/val/'\n", + "ann_file_train = 'kinetics400_tiny/kinetics_tiny_train_video.txt'\n", + "ann_file_val = 'kinetics400_tiny/kinetics_tiny_val_video.txt'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', 
input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_train_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/train/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames', clip_len=1, frame_interval=1,\n", + " num_clips=3),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(\n", + " type='MultiScaleCrop',\n", + " input_size=224,\n", + " scales=(1, 0.875, 0.75, 0.66),\n", + " random_crop=False,\n", + " max_wh_scale_gap=1),\n", + " dict(type='Resize', scale=(224, 224), keep_ratio=False),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ]))\n", + "val_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " 
num_clips=3,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='CenterCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='VideoDataset',\n", + " ann_file='kinetics400_tiny/kinetics_tiny_val_video.txt',\n", + " data_prefix=dict(video='kinetics400_tiny/val/'),\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleFrames',\n", + " clip_len=1,\n", + " frame_interval=1,\n", + " num_clips=25,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='TenCrop', crop_size=224),\n", + " dict(type='FormatShape', input_format='NCHW'),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " test_mode=True))\n", + "val_evaluator = dict(type='AccMetric')\n", + "test_evaluator = dict(type='AccMetric')\n", + "auto_scale_lr = dict(enable=False, base_batch_size=256)\n", + "work_dir = './tutorial_exps'\n", + "\n", + "05/15 03:33:35 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "05/15 03:33:35 - mmengine - INFO - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " 
-------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "Loads checkpoint by http backend from path: https://download.pytorch.org/models/resnet50-11ad3fa6.pth\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading: \"https://download.pytorch.org/models/resnet50-11ad3fa6.pth\" 
to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:33:37 - mmengine - INFO - These parameters in pretrained checkpoint are not loaded: {'fc.weight', 'fc.bias'}\n", + "Loads checkpoint by local backend from path: ./checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "The model and loaded state dict do not match exactly\n", + "\n", + "size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 2048]) from checkpoint, the shape in current model is torch.Size([2, 2048]).\n", + "size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([2]).\n", + "05/15 03:33:37 - mmengine - INFO - Load checkpoint from ./checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth\n", + "05/15 03:33:37 - mmengine - WARNING - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "05/15 03:33:37 - mmengine - INFO - Checkpoints will be saved to /content/mmaction2/tutorial_exps.\n", + "05/15 03:33:41 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:41 - mmengine - INFO - Epoch(train) [1][15/15] lr: 7.8125e-05 eta: 0:00:31 time: 0.2334 data_time: 0.0793 memory: 2917 grad_norm: 11.9900 loss: 0.6971 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6971\n", + "05/15 03:33:42 - mmengine - INFO - Epoch(val) [1][5/5] acc/top1: 0.3000 acc/top5: 1.0000 acc/mean1: 0.3000 data_time: 0.1994 time: 0.2254\n", + "05/15 03:33:42 - mmengine - INFO - The best checkpoint with 0.3000 acc/top1 at 1 epoch is saved to best_acc_top1_epoch_1.pth.\n", + "05/15 03:33:46 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:46 - mmengine - INFO - 
Epoch(train) [2][15/15] lr: 7.8125e-05 eta: 0:00:29 time: 0.2373 data_time: 0.1369 memory: 961 grad_norm: 12.4935 loss: 0.7158 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.7158\n", + "05/15 03:33:48 - mmengine - INFO - Epoch(val) [2][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.2692 time: 0.3006\n", + "05/15 03:33:48 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_1.pth is removed\n", + "05/15 03:33:48 - mmengine - INFO - The best checkpoint with 0.7000 acc/top1 at 2 epoch is saved to best_acc_top1_epoch_2.pth.\n", + "05/15 03:33:51 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:51 - mmengine - INFO - Epoch(train) [3][15/15] lr: 7.8125e-05 eta: 0:00:24 time: 0.2112 data_time: 0.1163 memory: 961 grad_norm: 13.4063 loss: 0.7338 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7338\n", + "05/15 03:33:51 - mmengine - INFO - Saving checkpoint at 3 epochs\n", + "05/15 03:33:53 - mmengine - INFO - Epoch(val) [3][5/5] acc/top1: 0.4000 acc/top5: 1.0000 acc/mean1: 0.4000 data_time: 0.1669 time: 0.1906\n", + "05/15 03:33:56 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:33:56 - mmengine - INFO - Epoch(train) [4][15/15] lr: 7.8125e-05 eta: 0:00:19 time: 0.1750 data_time: 0.0907 memory: 961 grad_norm: 12.4322 loss: 0.6894 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.6894\n", + "05/15 03:33:57 - mmengine - INFO - Epoch(val) [4][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.1791 time: 0.2030\n", + "05/15 03:34:00 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:00 - mmengine - INFO - Epoch(train) [5][15/15] lr: 7.8125e-05 eta: 0:00:16 time: 0.2016 data_time: 0.1155 memory: 961 grad_norm: 11.5982 loss: 0.6940 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 
0.6940\n", + "05/15 03:34:02 - mmengine - INFO - Epoch(val) [5][5/5] acc/top1: 0.7000 acc/top5: 1.0000 acc/mean1: 0.7000 data_time: 0.3145 time: 0.3455\n", + "05/15 03:34:05 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:05 - mmengine - INFO - Epoch(train) [6][15/15] lr: 7.8125e-05 eta: 0:00:13 time: 0.2366 data_time: 0.1440 memory: 961 grad_norm: 12.0952 loss: 0.6667 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.6667\n", + "05/15 03:34:05 - mmengine - INFO - Saving checkpoint at 6 epochs\n", + "05/15 03:34:08 - mmengine - INFO - Epoch(val) [6][5/5] acc/top1: 0.6000 acc/top5: 1.0000 acc/mean1: 0.6000 data_time: 0.2172 time: 0.2403\n", + "05/15 03:34:10 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:10 - mmengine - INFO - Epoch(train) [7][15/15] lr: 7.8125e-05 eta: 0:00:09 time: 0.1784 data_time: 0.0942 memory: 961 grad_norm: 12.4209 loss: 0.6570 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6570\n", + "05/15 03:34:11 - mmengine - INFO - Epoch(val) [7][5/5] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 0.1898 time: 0.2118\n", + "05/15 03:34:11 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_2.pth is removed\n", + "05/15 03:34:12 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 7 epoch is saved to best_acc_top1_epoch_7.pth.\n", + "05/15 03:34:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:15 - mmengine - INFO - Epoch(train) [8][15/15] lr: 7.8125e-05 eta: 0:00:06 time: 0.2073 data_time: 0.1220 memory: 961 grad_norm: 11.4271 loss: 0.6241 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.6241\n", + "05/15 03:34:17 - mmengine - INFO - Epoch(val) [8][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.3497 time: 0.3890\n", + "05/15 
03:34:17 - mmengine - INFO - The previous best checkpoint /content/mmaction2/tutorial_exps/best_acc_top1_epoch_7.pth is removed\n", + "05/15 03:34:18 - mmengine - INFO - The best checkpoint with 1.0000 acc/top1 at 8 epoch is saved to best_acc_top1_epoch_8.pth.\n", + "05/15 03:34:21 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:21 - mmengine - INFO - Epoch(train) [9][15/15] lr: 7.8125e-05 eta: 0:00:03 time: 0.2309 data_time: 0.1390 memory: 961 grad_norm: 12.3066 loss: 0.6451 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6451\n", + "05/15 03:34:21 - mmengine - INFO - Saving checkpoint at 9 epochs\n", + "05/15 03:34:23 - mmengine - INFO - Epoch(val) [9][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.2023 time: 0.2256\n", + "05/15 03:34:26 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230515_033334\n", + "05/15 03:34:26 - mmengine - INFO - Epoch(train) [10][15/15] lr: 7.8125e-05 eta: 0:00:00 time: 0.1733 data_time: 0.0951 memory: 961 grad_norm: 11.1461 loss: 0.5931 top1_acc: 1.0000 top5_acc: 1.0000 loss_cls: 0.5931\n", + "05/15 03:34:26 - mmengine - INFO - Saving checkpoint at 10 epochs\n", + "05/15 03:34:27 - mmengine - INFO - Epoch(val) [10][5/5] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000 data_time: 0.1836 time: 0.2048\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Recognizer2D(\n", + " (data_preprocessor): ActionDataPreprocessor()\n", + " (backbone): ResNet(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n", + " (layer1): Sequential(\n", + " (0): Bottleneck(\n", + " 
(conv1): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer2): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 
128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer3): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): 
Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (4): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), 
bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (5): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer4): Sequential(\n", + " (0): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): ConvModule(\n", + " (conv): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): ConvModule(\n", + " (conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv2): ConvModule(\n", + " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " (conv3): ConvModule(\n", + " (conv): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " )\n", + " (cls_head): TSNHead(\n", + " (loss_cls): 
CrossEntropyLoss()\n", + " (consensus): AvgConsensus()\n", + " (avg_pool): AdaptiveAvgPool2d(output_size=(1, 1))\n", + " (dropout): Dropout(p=0.4, inplace=False)\n", + " (fc_cls): Linear(in_features=2048, out_features=2, bias=True)\n", + " )\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "import os.path as osp\n", + "import mmengine\n", + "from mmengine.runner import Runner\n", + "\n", + "# Create work_dir\n", + "mmengine.mkdir_or_exist(osp.abspath(cfg.work_dir))\n", + "\n", + "# build the runner from config\n", + "runner = Runner.from_cfg(cfg)\n", + "\n", + "# start training\n", + "runner.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zdSd7oTLlxIf" + }, + "source": [ + "### Understand the log\n", + "From the log, we can have a basic understanding the training process and know how well the recognizer is trained.\n", + "\n", + "Firstly, the ResNet-50 backbone pre-trained on ImageNet is loaded, this is a common practice since training from scratch is more cost. The log shows that all the weights of the ResNet-50 backbone are loaded except the `fc.bias` and `fc.weight`.\n", + "\n", + "Second, since the dataset we are using is small, we loaded a TSN model and finetune it for action recognition.\n", + "The original TSN is trained on original Kinetics-400 dataset which contains 400 classes but Kinetics-400 Tiny dataset only have 2 classes. Therefore, the last FC layer of the pre-trained TSN for classification has different weight shape and is not used.\n", + "\n", + "Third, after training, the recognizer is evaluated by the default evaluation. The results show that the recognizer achieves 100% top1 accuracy and 100% top5 accuracy on the val dataset,\n", + " \n", + "Not bad!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ryVoSfZVmogw" + }, + "source": [ + "## Test the trained recognizer\n", + "\n", + "After finetuning the recognizer, let's check the prediction results!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eyY3hCMwyTct", + "outputId": "34fbbdc5-b9fd-4fd2-8030-3ba56b10adbf" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "05/15 03:34:36 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000 data_time: 0.0586 time: 0.7817\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'acc/top1': 0.9, 'acc/top5': 1.0, 'acc/mean1': 0.9}" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "runner.test()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "mmact_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "vscode": { + "interpreter": { + "hash": "189c342a4747645665e89db23000ac4d4edb7a87c4cd0b2f881610f468fb778d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/webcam_demo.py b/demo/webcam_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..cd432b5cc9afc01fa0139d3178b4292ea7a25f22 --- /dev/null +++ b/demo/webcam_demo.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import time +from collections import deque +from operator import itemgetter +from threading import Thread + +import cv2 +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.dataset import Compose, pseudo_collate + +from mmaction.apis import init_recognizer +from mmaction.utils import get_str_type + +FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL +FONTSCALE = 1 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 +EXCLUED_STEPS = [ + 'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit', + 'PyAVDecode', 'RawFrameDecode' +] + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 webcam demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('label', help='label file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--camera-id', type=int, default=0, help='camera device id') + parser.add_argument( + '--threshold', + type=float, + default=0.01, + help='recognition score threshold') + parser.add_argument( + '--average-size', + type=int, + default=1, + help='number of latest clips to be averaged for prediction') + parser.add_argument( + '--drawing-fps', + type=int, + default=20, + help='Set upper bound FPS value of the output drawing') + parser.add_argument( + '--inference-fps', + type=int, + default=4, + help='Set upper bound FPS value of model inference') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + assert args.drawing_fps >= 0 and args.inference_fps >= 0, \ + 'upper bound FPS value of drawing and inference should be set as ' \ + 'positive number, or zero for no limit' + return args + + +def show_results(): + print('Press "Esc", "q" or "Q" to exit') + + text_info = {} + cur_time = time.time() + while True: + msg = 'Waiting for action ...' + _, frame = camera.read() + frame_queue.append(np.array(frame[:, :, ::-1])) + + if len(result_queue) != 0: + text_info = {} + results = result_queue.popleft() + for i, result in enumerate(results): + selected_label, score = result + if score < threshold: + break + location = (0, 40 + i * 20) + text = selected_label + ': ' + str(round(score * 100, 2)) + text_info[location] = text + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + elif len(text_info) != 0: + for location, text in text_info.items(): + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + else: + cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, MSGCOLOR, + THICKNESS, LINETYPE) + + cv2.imshow('camera', frame) + ch = cv2.waitKey(1) + + if ch == 27 or ch == ord('q') or ch == ord('Q'): + camera.release() + cv2.destroyAllWindows() + break + + if drawing_fps > 0: + # add a limiter for actual drawing fps <= drawing_fps + sleep_time = 1 / drawing_fps - (time.time() - cur_time) + if sleep_time > 0: + time.sleep(sleep_time) + cur_time = time.time() + + +def inference(): + score_cache = deque() + scores_sum = 0 + cur_time = time.time() + while True: + cur_windows = [] + + while len(cur_windows) == 0: + if len(frame_queue) == sample_length: + cur_windows = list(np.array(frame_queue)) + if data['img_shape'] is None: + data['img_shape'] = frame_queue.popleft().shape[:2] + + cur_data = data.copy() + cur_data['imgs'] = cur_windows + cur_data = test_pipeline(cur_data) + 
cur_data = pseudo_collate([cur_data]) + + # Forward the model + with torch.no_grad(): + result = model.test_step(cur_data)[0] + scores = result.pred_score.tolist() + scores = np.array(scores) + score_cache.append(scores) + scores_sum += scores + + if len(score_cache) == average_size: + scores_avg = scores_sum / average_size + num_selected_labels = min(len(label), 5) + + score_tuples = tuple(zip(label, scores_avg)) + score_sorted = sorted( + score_tuples, key=itemgetter(1), reverse=True) + results = score_sorted[:num_selected_labels] + + result_queue.append(results) + scores_sum -= score_cache.popleft() + + if inference_fps > 0: + # add a limiter for actual inference fps <= inference_fps + sleep_time = 1 / inference_fps - (time.time() - cur_time) + if sleep_time > 0: + time.sleep(sleep_time) + cur_time = time.time() + + +def main(): + global average_size, threshold, drawing_fps, inference_fps, \ + device, model, camera, data, label, sample_length, \ + test_pipeline, frame_queue, result_queue + + args = parse_args() + average_size = args.average_size + threshold = args.threshold + drawing_fps = args.drawing_fps + inference_fps = args.inference_fps + + device = torch.device(args.device) + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # Build the recognizer from a config file and checkpoint file/url + model = init_recognizer(cfg, args.checkpoint, device=args.device) + camera = cv2.VideoCapture(args.camera_id) + data = dict(img_shape=None, modality='RGB', label=-1) + + with open(args.label, 'r') as f: + label = [line.strip() for line in f] + + # prepare test pipeline from non-camera pipeline + cfg = model.cfg + sample_length = 0 + pipeline = cfg.test_pipeline + pipeline_ = pipeline.copy() + for step in pipeline: + if 'SampleFrames' in get_str_type(step['type']): + sample_length = step['clip_len'] * step['num_clips'] + data['num_clips'] = step['num_clips'] + data['clip_len'] = step['clip_len'] + 
pipeline_.remove(step) + if get_str_type(step['type']) in EXCLUED_STEPS: + # remove step to decode frames + pipeline_.remove(step) + test_pipeline = Compose(pipeline_) + + assert sample_length > 0 + + try: + frame_queue = deque(maxlen=sample_length) + result_queue = deque(maxlen=1) + pw = Thread(target=show_results, args=(), daemon=True) + pr = Thread(target=inference, args=(), daemon=True) + pw.start() + pr.start() + pw.join() + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + main() diff --git a/demo/webcam_demo_spatiotemporal_det.py b/demo/webcam_demo_spatiotemporal_det.py new file mode 100644 index 0000000000000000000000000000000000000000..75a534bdf65f07817aed1966d484cc02e0f54abc --- /dev/null +++ b/demo/webcam_demo_spatiotemporal_det.py @@ -0,0 +1,864 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Webcam Spatio-Temporal Action Detection Demo. + +Some codes are based on https://github.com/facebookresearch/SlowFast +""" + +import argparse +import atexit +import copy +import logging +import queue +import threading +import time +from abc import ABCMeta, abstractmethod + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine import Config, DictAction +from mmengine.structures import InstanceData + +from mmaction.structures import ActionDataSample + +try: + from mmdet.apis import inference_detector, init_detector +except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_detector` and ' + '`init_detector` form `mmdet.apis`. These apis are ' + 'required in this demo! 
') + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 webcam spatio-temporal detection demo') + + parser.add_argument( + '--config', + default=( + 'configs/detection/slowonly/' + 'slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py' + ), + help='spatio temporal detection config file path') + parser.add_argument( + '--checkpoint', + default=('https://download.openmmlab.com/mmaction/detection/ava/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/' + 'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb' + '_20201217-16378594.pth'), + help='spatio temporal detection checkpoint file/url') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.4, + help='the threshold of human action score') + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--input-video', + default='0', + type=str, + help='webcam id or input video file/url') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--output-fps', + default=15, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--out-filename', + default=None, + type=str, + help='the filename of output video') + parser.add_argument( + '--show', + 
action='store_true', + help='Whether to show results with cv2.imshow') + parser.add_argument( + '--display-height', + type=int, + default=0, + help='Image height for human detector and draw frames.') + parser.add_argument( + '--display-width', + type=int, + default=0, + help='Image width for human detector and draw frames.') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--clip-vis-length', + default=8, + type=int, + help='Number of draw frames per clip.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + + args = parser.parse_args() + return args + + +class TaskInfo: + """Wapper for a clip. + + Transmit data around three threads. + + 1) Read Thread: Create task and put task into read queue. Init `frames`, + `processed_frames`, `img_shape`, `ratio`, `clip_vis_length`. + 2) Main Thread: Get data from read queue, predict human bboxes and stdet + action labels, draw predictions and put task into display queue. Init + `display_bboxes`, `stdet_bboxes` and `action_preds`, update `frames`. + 3) Display Thread: Get data from display queue, show/write frames and + delete task. + """ + + def __init__(self): + self.id = -1 + + # raw frames, used as human detector input, draw predictions input + # and output, display input + self.frames = None + + # stdet params + self.processed_frames = None # model inputs + self.frames_inds = None # select frames from processed frames + self.img_shape = None # model inputs, processed frame shape + # `action_preds` is `list[list[tuple]]`. The outer brackets indicate + # different bboxes and the intter brackets indicate different action + # results for the same bbox. 
tuple contains `class_name` and `score`. + self.action_preds = None # stdet results + + # human bboxes with the format (xmin, ymin, xmax, ymax) + self.display_bboxes = None # bboxes coords for self.frames + self.stdet_bboxes = None # bboxes coords for self.processed_frames + self.ratio = None # processed_frames.shape[1::-1]/frames.shape[1::-1] + + # for each clip, draw predictions on clip_vis_length frames + self.clip_vis_length = -1 + + def add_frames(self, idx, frames, processed_frames): + """Add the clip and corresponding id. + + Args: + idx (int): the current index of the clip. + frames (list[ndarray]): list of images in "BGR" format. + processed_frames (list[ndarray]): list of resize and normed images + in "BGR" format. + """ + self.frames = frames + self.processed_frames = processed_frames + self.id = idx + self.img_shape = processed_frames[0].shape[:2] + + def add_bboxes(self, display_bboxes): + """Add correspondding bounding boxes.""" + self.display_bboxes = display_bboxes + self.stdet_bboxes = display_bboxes.clone() + self.stdet_bboxes[:, ::2] = self.stdet_bboxes[:, ::2] * self.ratio[0] + self.stdet_bboxes[:, 1::2] = self.stdet_bboxes[:, 1::2] * self.ratio[1] + + def add_action_preds(self, preds): + """Add the corresponding action predictions.""" + self.action_preds = preds + + def get_model_inputs(self, device): + """Convert preprocessed images to MMAction2 STDet model inputs.""" + cur_frames = [self.processed_frames[idx] for idx in self.frames_inds] + input_array = np.stack(cur_frames).transpose((3, 0, 1, 2))[np.newaxis] + input_tensor = torch.from_numpy(input_array).to(device) + datasample = ActionDataSample() + datasample.proposals = InstanceData(bboxes=self.stdet_bboxes) + datasample.set_metainfo(dict(img_shape=self.img_shape)) + + return dict( + inputs=input_tensor, data_samples=[datasample], mode='predict') + + +class BaseHumanDetector(metaclass=ABCMeta): + """Base class for Human Dector. + + Args: + device (str): CPU/CUDA device option. 
+ """ + + def __init__(self, device): + self.device = torch.device(device) + + @abstractmethod + def _do_detect(self, image): + """Get human bboxes with shape [n, 4]. + + The format of bboxes is (xmin, ymin, xmax, ymax) in pixels. + """ + + def predict(self, task): + """Add keyframe bboxes to task.""" + # keyframe idx == (clip_len * frame_interval) // 2 + keyframe = task.frames[len(task.frames) // 2] + + # call detector + bboxes = self._do_detect(keyframe) + + # convert bboxes to torch.Tensor and move to target device + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(self.device) + elif isinstance(bboxes, torch.Tensor) and bboxes.device != self.device: + bboxes = bboxes.to(self.device) + + # update task + task.add_bboxes(bboxes) + + return task + + +class MmdetHumanDetector(BaseHumanDetector): + """Wrapper for mmdetection human detector. + + Args: + config (str): Path to mmdetection config. + ckpt (str): Path to mmdetection checkpoint. + device (str): CPU/CUDA device option. + score_thr (float): The threshold of human detection score. + person_classid (int): Choose class from detection results. + Default: 0. Suitable for COCO pretrained models. + """ + + def __init__(self, config, ckpt, device, score_thr, person_classid=0): + super().__init__(device) + self.model = init_detector(config, ckpt, device=device) + self.person_classid = person_classid + self.score_thr = score_thr + + def _do_detect(self, image): + """Get bboxes in shape [n, 4] and values in pixels.""" + det_data_sample = inference_detector(self.model, image) + pred_instance = det_data_sample.pred_instances.cpu().numpy() + # We only keep human detection bboxs with score larger + # than `det_score_thr` and category id equal to `det_cat_id`. 
+ valid_idx = np.logical_and(pred_instance.labels == self.person_classid, + pred_instance.scores > self.score_thr) + bboxes = pred_instance.bboxes[valid_idx] + # result = result[result[:, 4] >= self.score_thr][:, :4] + return bboxes + + +class StdetPredictor: + """Wrapper for MMAction2 spatio-temporal action models. + + Args: + config (str): Path to stdet config. + ckpt (str): Path to stdet checkpoint. + device (str): CPU/CUDA device option. + score_thr (float): The threshold of human action score. + label_map_path (str): Path to label map file. The format for each line + is `{class_id}: {class_name}`. + """ + + def __init__(self, config, checkpoint, device, score_thr, label_map_path): + self.score_thr = score_thr + + # load model + config.model.backbone.pretrained = None + # model = build_detector(config.model, test_cfg=config.get('test_cfg')) + # load_checkpoint(model, checkpoint, map_location='cpu') + # model.to(device) + # model.eval() + model = init_detector(config, checkpoint, device=device) + self.model = model + self.device = device + + # init label map, aka class_id to class_name dict + with open(label_map_path) as f: + lines = f.readlines() + lines = [x.strip().split(': ') for x in lines] + self.label_map = {int(x[0]): x[1] for x in lines} + try: + if config['data']['train']['custom_classes'] is not None: + self.label_map = { + id + 1: self.label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + def predict(self, task): + """Spatio-temporval Action Detection model inference.""" + # No need to do inference if no one in keyframe + if len(task.stdet_bboxes) == 0: + return task + + with torch.no_grad(): + result = self.model(**task.get_model_inputs(self.device)) + scores = result[0].pred_instances.scores + # pack results of human detector and stdet + preds = [] + for _ in range(task.stdet_bboxes.shape[0]): + preds.append([]) + for class_id in range(scores.shape[1]): + if class_id not in 
self.label_map: + continue + for bbox_id in range(task.stdet_bboxes.shape[0]): + if scores[bbox_id][class_id] > self.score_thr: + preds[bbox_id].append((self.label_map[class_id], + scores[bbox_id][class_id].item())) + + # update task + # `preds` is `list[list[tuple]]`. The outer brackets indicate + # different bboxes and the intter brackets indicate different action + # results for the same bbox. tuple contains `class_name` and `score`. + task.add_action_preds(preds) + + return task + + +class ClipHelper: + """Multithrading utils to manage the lifecycle of task.""" + + def __init__(self, + config, + display_height=0, + display_width=0, + input_video=0, + predict_stepsize=40, + output_fps=25, + clip_vis_length=8, + out_filename=None, + show=True, + stdet_input_shortside=256): + # stdet sampling strategy + val_pipeline = config.val_pipeline + sampler = [x for x in val_pipeline + if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler[ + 'frame_interval'] + self.window_size = clip_len * frame_interval + + # asserts + assert (out_filename or show), \ + 'out_filename and show cannot both be None' + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + assert clip_vis_length <= predict_stepsize + assert 0 < predict_stepsize <= self.window_size + + # source params + try: + self.cap = cv2.VideoCapture(int(input_video)) + self.webcam = True + except ValueError: + self.cap = cv2.VideoCapture(input_video) + self.webcam = False + assert self.cap.isOpened() + + # stdet input preprocessing params + h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + self.stdet_input_size = mmcv.rescale_size( + (w, h), (stdet_input_shortside, np.Inf)) + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + self.img_norm_cfg = img_norm_cfg + + # task init params + self.clip_vis_length = clip_vis_length + 
self.predict_stepsize = predict_stepsize + self.buffer_size = self.window_size - self.predict_stepsize + frame_start = self.window_size // 2 - (clip_len // 2) * frame_interval + self.frames_inds = [ + frame_start + frame_interval * i for i in range(clip_len) + ] + self.buffer = [] + self.processed_buffer = [] + + # output/display params + if display_height > 0 and display_width > 0: + self.display_size = (display_width, display_height) + elif display_height > 0 or display_width > 0: + self.display_size = mmcv.rescale_size( + (w, h), (np.Inf, max(display_height, display_width))) + else: + self.display_size = (w, h) + self.ratio = tuple( + n / o for n, o in zip(self.stdet_input_size, self.display_size)) + if output_fps <= 0: + self.output_fps = int(self.cap.get(cv2.CAP_PROP_FPS)) + else: + self.output_fps = output_fps + self.show = show + self.video_writer = None + if out_filename is not None: + self.video_writer = self.get_output_video_writer(out_filename) + display_start_idx = self.window_size // 2 - self.predict_stepsize // 2 + self.display_inds = [ + display_start_idx + i for i in range(self.predict_stepsize) + ] + + # display multi-theading params + self.display_id = -1 # task.id for display queue + self.display_queue = {} + self.display_lock = threading.Lock() + self.output_lock = threading.Lock() + + # read multi-theading params + self.read_id = -1 # task.id for read queue + self.read_id_lock = threading.Lock() + self.read_queue = queue.Queue() + self.read_lock = threading.Lock() + self.not_end = True # cap.read() flag + + # program state + self.stopped = False + + atexit.register(self.clean) + + def read_fn(self): + """Main function for read thread. + + Contains three steps: + + 1) Read and preprocess (resize + norm) frames from source. + 2) Create task by frames from previous step and buffer. + 3) Put task into read queue. 
+ """ + was_read = True + start_time = time.time() + while was_read and not self.stopped: + # init task + task = TaskInfo() + task.clip_vis_length = self.clip_vis_length + task.frames_inds = self.frames_inds + task.ratio = self.ratio + + # read buffer + frames = [] + processed_frames = [] + if len(self.buffer) != 0: + frames = self.buffer + if len(self.processed_buffer) != 0: + processed_frames = self.processed_buffer + + # read and preprocess frames from source and update task + with self.read_lock: + before_read = time.time() + read_frame_cnt = self.window_size - len(frames) + while was_read and len(frames) < self.window_size: + was_read, frame = self.cap.read() + if not self.webcam: + # Reading frames too fast may lead to unexpected + # performance degradation. If you have enough + # resource, this line could be commented. + time.sleep(1 / self.output_fps) + if was_read: + frames.append(mmcv.imresize(frame, self.display_size)) + processed_frame = mmcv.imresize( + frame, self.stdet_input_size).astype(np.float32) + _ = mmcv.imnormalize_(processed_frame, + **self.img_norm_cfg) + processed_frames.append(processed_frame) + task.add_frames(self.read_id + 1, frames, processed_frames) + + # update buffer + if was_read: + self.buffer = frames[-self.buffer_size:] + self.processed_buffer = processed_frames[-self.buffer_size:] + + # update read state + with self.read_id_lock: + self.read_id += 1 + self.not_end = was_read + + self.read_queue.put((was_read, copy.deepcopy(task))) + cur_time = time.time() + logger.debug( + f'Read thread: {1000*(cur_time - start_time):.0f} ms, ' + f'{read_frame_cnt / (cur_time - before_read):.0f} fps') + start_time = cur_time + + def display_fn(self): + """Main function for display thread. + + Read data from display queue and display predictions. 
+ """ + start_time = time.time() + while not self.stopped: + # get the state of the read thread + with self.read_id_lock: + read_id = self.read_id + not_end = self.not_end + + with self.display_lock: + # If video ended and we have display all frames. + if not not_end and self.display_id == read_id: + break + + # If the next task are not available, wait. + if (len(self.display_queue) == 0 or + self.display_queue.get(self.display_id + 1) is None): + time.sleep(0.02) + continue + + # get display data and update state + self.display_id += 1 + was_read, task = self.display_queue[self.display_id] + del self.display_queue[self.display_id] + display_id = self.display_id + + # do display predictions + with self.output_lock: + if was_read and task.id == 0: + # the first task + cur_display_inds = range(self.display_inds[-1] + 1) + elif not was_read: + # the last task + cur_display_inds = range(self.display_inds[0], + len(task.frames)) + else: + cur_display_inds = self.display_inds + + for frame_id in cur_display_inds: + frame = task.frames[frame_id] + if self.show: + cv2.imshow('Demo', frame) + cv2.waitKey(int(1000 / self.output_fps)) + if self.video_writer: + self.video_writer.write(frame) + + cur_time = time.time() + logger.debug( + f'Display thread: {1000*(cur_time - start_time):.0f} ms, ' + f'read id {read_id}, display id {display_id}') + start_time = cur_time + + def __iter__(self): + return self + + def __next__(self): + """Get data from read queue. + + This function is part of the main thread. + """ + if self.read_queue.qsize() == 0: + time.sleep(0.02) + return not self.stopped, None + + was_read, task = self.read_queue.get() + if not was_read: + # If we reach the end of the video, there aren't enough frames + # in the task.processed_frames, so no need to model inference + # and draw predictions. Put task into display queue. 
+ with self.read_id_lock: + read_id = self.read_id + with self.display_lock: + self.display_queue[read_id] = was_read, copy.deepcopy(task) + + # main thread doesn't need to handle this task again + task = None + return was_read, task + + def start(self): + """Start read thread and display thread.""" + self.read_thread = threading.Thread( + target=self.read_fn, args=(), name='VidRead-Thread', daemon=True) + self.read_thread.start() + self.display_thread = threading.Thread( + target=self.display_fn, + args=(), + name='VidDisplay-Thread', + daemon=True) + self.display_thread.start() + + return self + + def clean(self): + """Close all threads and release all resources.""" + self.stopped = True + self.read_lock.acquire() + self.cap.release() + self.read_lock.release() + self.output_lock.acquire() + cv2.destroyAllWindows() + if self.video_writer: + self.video_writer.release() + self.output_lock.release() + + def join(self): + """Waiting for the finalization of read and display thread.""" + self.read_thread.join() + self.display_thread.join() + + def display(self, task): + """Add the visualized task to the display queue. + + Args: + task (TaskInfo object): task object that contain the necessary + information for prediction visualization. + """ + with self.display_lock: + self.display_queue[task.id] = (True, task) + + def get_output_video_writer(self, path): + """Return a video writer object. + + Args: + path (str): path to the output video file. 
+ """ + return cv2.VideoWriter( + filename=path, + fourcc=cv2.VideoWriter_fourcc(*'mp4v'), + fps=float(self.output_fps), + frameSize=self.display_size, + isColor=True) + + +class BaseVisualizer(metaclass=ABCMeta): + """Base class for visualization tools.""" + + def __init__(self, max_labels_per_bbox): + self.max_labels_per_bbox = max_labels_per_bbox + + def draw_predictions(self, task): + """Visualize stdet predictions on raw frames.""" + # read bboxes from task + bboxes = task.display_bboxes.cpu().numpy() + + # draw predictions and update task + keyframe_idx = len(task.frames) // 2 + draw_range = [ + keyframe_idx - task.clip_vis_length // 2, + keyframe_idx + (task.clip_vis_length - 1) // 2 + ] + assert draw_range[0] >= 0 and draw_range[1] < len(task.frames) + task.frames = self.draw_clip_range(task.frames, task.action_preds, + bboxes, draw_range) + + return task + + def draw_clip_range(self, frames, preds, bboxes, draw_range): + """Draw a range of frames with the same bboxes and predictions.""" + # no predictions to be draw + if bboxes is None or len(bboxes) == 0: + return frames + + # draw frames in `draw_range` + left_frames = frames[:draw_range[0]] + right_frames = frames[draw_range[1] + 1:] + draw_frames = frames[draw_range[0]:draw_range[1] + 1] + + # get labels(texts) and draw predictions + draw_frames = [ + self.draw_one_image(frame, bboxes, preds) for frame in draw_frames + ] + + return list(left_frames) + draw_frames + list(right_frames) + + @abstractmethod + def draw_one_image(self, frame, bboxes, preds): + """Draw bboxes and corresponding texts on one frame.""" + + @staticmethod + def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +class DefaultVisualizer(BaseVisualizer): + """Tools to visualize predictions. 
+ + Args: + max_labels_per_bbox (int): Max number of labels to visualize for a + person box. Default: 5. + plate (str): The color plate used for visualization. Two recommended + plates are blue plate `03045e-023e8a-0077b6-0096c7-00b4d8-48cae4` + and green plate `004b23-006400-007200-008000-38b000-70e000`. These + plates are generated by https://coolors.co/. + Default: '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'. + text_fontface (int): Fontface from OpenCV for texts. + Default: cv2.FONT_HERSHEY_DUPLEX. + text_fontscale (float): Fontscale from OpenCV for texts. + Default: 0.5. + text_fontcolor (tuple): fontface from OpenCV for texts. + Default: (255, 255, 255). + text_thickness (int): Thickness from OpenCV for texts. + Default: 1. + text_linetype (int): LInetype from OpenCV for texts. + Default: 1. + """ + + def __init__( + self, + max_labels_per_bbox=5, + plate='03045e-023e8a-0077b6-0096c7-00b4d8-48cae4', + text_fontface=cv2.FONT_HERSHEY_DUPLEX, + text_fontscale=0.5, + text_fontcolor=(255, 255, 255), # white + text_thickness=1, + text_linetype=1): + super().__init__(max_labels_per_bbox=max_labels_per_bbox) + self.text_fontface = text_fontface + self.text_fontscale = text_fontscale + self.text_fontcolor = text_fontcolor + self.text_thickness = text_thickness + self.text_linetype = text_linetype + + def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + plate = plate.split('-') + self.plate = [hex2color(h) for h in plate] + + def draw_one_image(self, frame, bboxes, preds): + """Draw predictions on one image.""" + for bbox, pred in zip(bboxes, preds): + # draw bbox + box = bbox.astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, (0, 0, 255), 2) + + # draw texts + for k, (label, score) in enumerate(pred): + if k >= self.max_labels_per_bbox: + break + text = f'{self.abbrev(label)}: {score:.4f}' + location = (0 + st[0], 18 + k * 18 + st[1]) 
+ textsize = cv2.getTextSize(text, self.text_fontface, + self.text_fontscale, + self.text_thickness)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, self.plate[k + 1], -1) + cv2.putText(frame, text, location, self.text_fontface, + self.text_fontscale, self.text_fontcolor, + self.text_thickness, self.text_linetype) + + return frame + + +def main(args): + # init human detector + human_detector = MmdetHumanDetector(args.det_config, args.det_checkpoint, + args.device, args.det_score_thr) + + # init action detector + config = Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + + try: + # In our spatiotemporal detection demo, different actions should have + # the same number of bboxes. + config['model']['test_cfg']['rcnn'] = dict(action_thr=0) + except KeyError: + pass + stdet_predictor = StdetPredictor( + config=config, + checkpoint=args.checkpoint, + device=args.device, + score_thr=args.action_score_thr, + label_map_path=args.label_map) + + # init clip helper + clip_helper = ClipHelper( + config=config, + display_height=args.display_height, + display_width=args.display_width, + input_video=args.input_video, + predict_stepsize=args.predict_stepsize, + output_fps=args.output_fps, + clip_vis_length=args.clip_vis_length, + out_filename=args.out_filename, + show=args.show) + + # init visualizer + vis = DefaultVisualizer() + + # start read and display thread + clip_helper.start() + + try: + # Main thread main function contains: + # 1) get data from read queue + # 2) get human bboxes and stdet predictions + # 3) draw stdet predictions and update task + # 4) put task into display queue + for able_to_read, task in clip_helper: + # get data from read queue + + if not able_to_read: + # read thread is dead and all tasks are processed + break + + if task is None: + # when no data in read queue, wait + time.sleep(0.01) + continue + + inference_start 
= time.time() + + # get human bboxes + human_detector.predict(task) + + # get stdet predictions + stdet_predictor.predict(task) + + # draw stdet predictions in raw frames + vis.draw_predictions(task) + logger.info(f'Stdet Results: {task.action_preds}') + + # add draw frames to display queue + clip_helper.display(task) + + logger.debug('Main thread inference time ' + f'{1000*(time.time() - inference_start):.0f} ms') + + # wait for display thread + clip_helper.join() + except KeyboardInterrupt: + pass + finally: + # close read & display thread, release all resources + clip_helper.clean() + + +if __name__ == '__main__': + main(parse_args()) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..59dbb71933a017f8f1acdfdbcfb5c9ee1c04419b --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,30 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +# fetch the key refer to https://forums.developer.nvidia.com/t/18-04-cuda-docker-image-is-broken/212892/9 +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub 32 +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 ffmpeg \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMCV +RUN pip install openmim +RUN mim install mmengine mmcv + +# Install MMAction2 +RUN conda clean --all +RUN git clone https://github.com/open-mmlab/mmaction2.git /mmaction2 +WORKDIR /mmaction2 +RUN mkdir -p /mmaction2/data +ENV FORCE_CUDA="1" +RUN git checkout main +RUN pip install cython --no-cache-dir +RUN pip install 
--no-cache-dir -e . diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b9525bb853e9042748d4cfa4106a52caa01c9136 --- /dev/null +++ b/docker/serve/Dockerfile @@ -0,0 +1,51 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="10.2" +ARG CUDNN="7" +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ARG MMCV="1.3.8" +ARG MMACTION="0.24.0" + +ENV PYTHONUNBUFFERED TRUE + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + g++ \ + openjdk-11-jre-headless \ + # MMDET Requirements + ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + libsndfile1 libturbojpeg \ + && rm -rf /var/lib/apt/lists/* + +ENV PATH="/opt/conda/bin:$PATH" +RUN export FORCE_CUDA=1 + +# TORCHSEVER +RUN pip install torchserve torch-model-archiver + +# MMLAB +ARG PYTORCH +ARG CUDA +RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"] +# RUN pip install mmaction2==${MMACTION} +RUN pip install git+https://github.com/open-mmlab/mmaction2.git + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh + +RUN chmod +x /usr/local/bin/entrypoint.sh \ + && chown -R model-server /home/model-server + +COPY config.properties /home/model-server/config.properties +RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store + +EXPOSE 8080 8081 8082 + +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["serve"] diff --git a/docker/serve/config.properties b/docker/serve/config.properties new file mode 100644 index 0000000000000000000000000000000000000000..dd9a685150199972c02d2c8bdcd910ee5c1a3ce4 --- /dev/null +++ b/docker/serve/config.properties @@ -0,0 +1,5 @@ 
+inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +model_store=/home/model-server/model-store +load_models=all diff --git a/docker/serve/entrypoint.sh b/docker/serve/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9aedae68fa0938c6ec096930375661ab49889b9 --- /dev/null +++ b/docker/serve/entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +if [[ "$1" = "serve" ]]; then + shift 1 + torchserve --start --ts-config /home/model-server/config.properties +else + eval "$@" +fi + +# prevent docker exit +tail -f /dev/null diff --git a/docs/en/Makefile b/docs/en/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..73a28c7134cd1760744f34bac4ebdedfbed40f72 --- /dev/null +++ b/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..55b3d3f8ffde0fd0e9d00e7f8b73124bba6cfe2d --- /dev/null +++ b/docs/en/_static/css/readthedocs.css @@ -0,0 +1,62 @@ +.header-logo { + background-image: url("../images/logo.png"); + background-size: 130px 40px; + height: 40px; + width: 130px; +} + +@media screen and (min-width: 1100px) { + .header-logo { + top: -12px; + } + } + + pre { + white-space: pre; + } + + @media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } + } + + + article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; + } + + /* Disable the change in tables */ + article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; + } + + table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; + } + + article.pytorch-article p.rubric { + font-weight: bold; + } diff --git a/docs/en/_static/images/logo.png b/docs/en/_static/images/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/docs/en/_static/images/logo.png differ diff --git a/docs/en/_static/js/custom.js b/docs/en/_static/js/custom.js new file mode 100644 index 0000000000000000000000000000000000000000..207dcb32ae79fa2f72220e39816011ce5c1c77c2 --- /dev/null +++ b/docs/en/_static/js/custom.js @@ -0,0 +1,10 @@ +var collapsedSections = ['Dataset Zoo']; + +$(document).ready(function () { + 
$('.model-summary').DataTable({ + "stateSave": false, + "lengthChange": false, + "pageLength": 20, + "order": [] + }); + }); diff --git a/docs/en/_templates/404.html b/docs/en/_templates/404.html new file mode 100644 index 0000000000000000000000000000000000000000..3dcff6e0ca7f27da4a0d379c9c34aeb087ed7f9e --- /dev/null +++ b/docs/en/_templates/404.html @@ -0,0 +1,18 @@ +{% extends "layout.html" %} + +{% block body %} + +

Page Not Found

+

+ The page you are looking for cannot be found. +

+

+ If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in + the content table left, or go to the homepage. +

+

+ If you cannot find documentation you want, please open an issue to tell us! +

+ +{% endblock %} diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..7cfa14770fc9c802e36c913cfd1de1ab8aef562b --- /dev/null +++ b/docs/en/advanced_guides/customize_dataset.md @@ -0,0 +1,121 @@ +# Customize Dataset + +In this tutorial, we will introduce some methods about how to customize your own dataset by online conversion. + +- [Customize Dataset](#customize-dataset) + - [General understanding of the Dataset in MMAction2](#general-understanding-of-the-dataset-in-mmaction2) + - [Customize new datasets](#customize-new-datasets) + - [Customize keypoint format for PoseDataset](#customize-keypoint-format-for-posedataset) + +## General understanding of the Dataset in MMAction2 + +MMAction2 provides task-specific `Dataset` class, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. These task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. The remaining functions are automatically handled by the superclass (i.e., `BaseActionDataset` and `BaseDataset`). The following table shows the inheritance relationship and the main method of the modules. + +| Class Name | Class Method | +| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MMAction2::VideoDataset` | `load_data_list(self)`
Build data list from the annotation file. | +| `MMAction2::BaseActionDataset` | `get_data_info(self, idx)`
Given the `idx`, return the corresponding data sample from the data list. | +| `MMEngine::BaseDataset` | `__getitem__(self, idx)`
Given the `idx`, call `get_data_info` to get the data sample, then call the `pipeline` to perform transforms and augmentation in `train_pipeline` or `val_pipeline` . | + +## Customize new datasets + +Although offline conversion is the preferred method for utilizing your own data in most cases, MMAction2 offers a convenient process for creating a customized `Dataset` class. As mentioned previously, task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. It is noteworthy that the elements in the `data_list` are `dict` with fields that are essential for the subsequent processes in the `pipeline`. + +Taking `VideoDataset` as an example, `train_pipeline`/`val_pipeline` require `'filename'` in `DecordInit` and `'label'` in `PackActionInputs`. Consequently, the data samples in the `data_list` must contain 2 fields: `'filename'` and `'label'`. +Please refer to [customize pipeline](customize_pipeline.md) for more details about the `pipeline`. + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +However, `AVADataset` is more complex, data samples in the `data_list` consist of several fields about the video data. Moreover, it overwrites `get_data_info(self, idx)` to convert keys that are indispensable in the spatio-temporal action detection pipeline. + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... 
+ ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## Customize keypoint format for PoseDataset + +MMAction2 currently supports three keypoint formats: `coco`, `nturgb+d` and `openpose`. If you use one of these formats, you may simply specify the corresponding format in the following modules: + +For Graph Convolutional Networks, such as AAGCN, STGCN, ... + +- `pipeline`: argument `dataset` in `JointToBone`. +- `backbone`: argument `graph_cfg` in Graph Convolutional Networks. + +For PoseC3D: + +- `pipeline`: In `Flip`, specify `left_kp` and `right_kp` based on the symmetrical relationship between keypoints. +- `pipeline`: In `GeneratePoseTarget`, specify `skeletons`, `left_limb`, `right_limb` if `with_limb` is `True`, and `left_kp`, `right_kp` if `with_kp` is `True`. + +If using a custom keypoint format, it is necessary to include a new graph layout in both the `backbone` and `pipeline`. This layout will define the keypoints and their connection relationship. + +Taking the `coco` dataset as an example, we define a layout named `coco` in `Graph`. The `inward` connections of this layout comprise all node connections, with each **centripetal** connection consisting of a tuple of nodes. Additional settings for `coco` include specifying the number of nodes as `17` the `node 0` as the central node. + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +Similarly, we define the `pairs` in `JointToBone`, adding a bone of `(0, 0)` to align the number of bones to the nodes. The `pairs` of coco dataset are shown below, and the order of `pairs` in `JointToBone` is irrelevant. 
+ +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) +``` + +To use your custom keypoint format, simply define the aforementioned settings as your graph structure and specify them in your config file as shown below, In this example, we will use `STGCN`, with `n` denoting the number of classes and `custom_dataset` defined in `Graph` and `JointToBone`. + +```python +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +``` diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md new file mode 100644 index 0000000000000000000000000000000000000000..145313a9a7ffc58bebea8fdd54c6a0b36dff14f3 --- /dev/null +++ b/docs/en/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# Customize Logging + +MMAction2 produces a lot of logs during the running process, such as loss, iteration time, learning rate, etc. In this section, we will introduce you how to output custom log. More details about the logging system, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html). 
+ +- [Customize Logging](#customize-logging) + - [Flexible Logging System](#flexible-logging-system) + - [Customize log](#customize-log) + - [Export the debug log](#export-the-debug-log) + +## Flexible Logging System + +The MMAction2 logging system is configured by the `LogProcessor` in [default_runtime](https://github.com/open-mmlab/mmaction2/tree/main/configs/_base_/default_runtime.py) by default, which is equivalent to: + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +By default, the `LogProcessor` captures all fields that begin with `loss` returned by `model.forward`. For instance, in the following model, `loss1` and `loss2` will be logged automatically without any additional configuration. + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +The output log adopts the following format: + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +`LogProcessor` will output the log in the following format: + +- The prefix of the log: + - epoch mode (`by_epoch=True`): `Epoch(train) [{current_epoch}][{current_iteration}/{dataloader_length}]` + - iteration mode (`by_epoch=False`): `Iter(train) [{current_iteration}/{max_iteration}]` +- Learning rate (`lr`): The learning rate of the last iteration. +- Time: + - `time`: The averaged time for inference of the last `window_size` iterations. 
+ - `data_time`: The averaged time for loading data of the last `window_size` iterations. + - `eta`: The estimated time of arrival to finish the training. +- Loss: The averaged loss output by model of the last `window_size` iterations. + +```{warning} +log_processor outputs the epoch based log by default(`by_epoch=True`). To get an expected log matched with the `train_cfg`, we should set the same value for `by_epoch` in `train_cfg` and `log_processor`. +``` + +Based on the rules above, the code snippet will count the average value of the loss1 and the loss2 every 20 iterations. More types of statistical methods, please refer to [mmengine.runner.LogProcessor](mmengine.runner.LogProcessor). + +## Customize log + +The logging system could not only log the `loss`, `lr`, .etc but also collect and output the custom log. For example, if we want to statistic the intermediate loss: + +The `ToyModel` calculate `loss_tmp` in forward, but don't save it into the return dict. + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # update the intermediate `loss_tmp` in the message hub + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +Add the `loss_tmp` into the config: + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # statistic the loss_tmp with the averaged value + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +The `loss_tmp` will be added to the output log: + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - 
Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## Export the debug log + +To export the debug log to the `work_dir`, you can set log_level in config file as follows: + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +Besides, logs of different ranks will be saved in `debug` mode if you are training your model with the shared storage. The hierarchy of the log is as follows: + +```text +./tmp +├── tmp.log +├── tmp_rank1.log +├── tmp_rank2.log +├── tmp_rank3.log +├── tmp_rank4.log +├── tmp_rank5.log +├── tmp_rank6.log +└── tmp_rank7.log +... +└── tmp_rank63.log +``` + +The log of Multiple machines with independent storage: + +```text +# device: 0: +work_dir/ +└── exp_name_logs + ├── exp_name.log + ├── exp_name_rank1.log + ├── exp_name_rank2.log + ├── exp_name_rank3.log + ... + └── exp_name_rank7.log + +# device: 7: +work_dir/ +└── exp_name_logs + ├── exp_name_rank56.log + ├── exp_name_rank57.log + ├── exp_name_rank58.log + ... + └── exp_name_rank63.log +``` diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md new file mode 100644 index 0000000000000000000000000000000000000000..3aa02f4cb25f963545aa03f4b49eebbf3a3a189e --- /dev/null +++ b/docs/en/advanced_guides/customize_models.md @@ -0,0 +1,3 @@ +# Customize Models + +coming soon... 
diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..2e95db15c5c857835f5a3b812b217f546171c03d --- /dev/null +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -0,0 +1,340 @@ +# Customize Optimizer + +In this tutorial, we will introduce some methods about how to build the optimizer and learning rate scheduler for your tasks. + +- [Customize Optimizer](#customize-optimizer) + - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Use optimizers supported by PyTorch](#use-optimizers-supported-by-pytorch) + - [Parameter-wise finely configuration](#parameter-wise-finely-configuration) + - [Gradient clipping](#gradient-clipping) + - [Gradient accumulation](#gradient-accumulation) + - [Customize parameter schedules](#customize-parameter-schedules) + - [Customize learning rate schedules](#customize-learning-rate-schedules) + - [Customize momentum schedules](#customize-momentum-schedules) + - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + - [Add new optimizers](#add-new-optimizers) + - [1. Implement a new optimizer](#1-implement-a-new-optimizer) + - [2. Import the optimizer](#2-import-the-optimizer) + - [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file) + - [Add new optimizer constructors](#add-new-optimizer-constructors) + +## Build optimizers using optim_wrapper + +We use the `optim_wrapper` field to configure the strategies of optimization, which includes choices of the optimizer, parameter-wise configurations, gradient clipping and accumulation. A simple example can be: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +In the above example, a SGD optimizer with learning rate 0.0003 and weight decay 0.0001 is built. 
+ +### Use optimizers supported by PyTorch + +We support all the optimizers implemented by PyTorch. To use a different optimizer, just need to change the `optimizer` field of config files. For example, if you want to use `torch.optim.Adam`, the modification in the config file could be as the following. + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +First we need to change the value of `type` to the desired optimizer name supported in `torch.optim`. Next we add necessary arguments of this optimizer to the `optimizer` field. The above config will build the following optimizer: + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### Parameter-wise finely configuration + +Some models may have parameter-specific settings for optimization, for example, no weight decay to the BatchNorm layers or using different learning rates for different network layers. +To finely configure them, we can use the `paramwise_cfg` argument in `optim_wrapper`. + +- **Set different hyper-parameter multipliers for different types of parameters.** + + For instance, we can set `norm_decay_mult=0.` in `paramwise_cfg` to change the weight decay of weight and bias of normalization layers to zero. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + More types of parameters are supported to configured, list as follow: + + - `lr_mult`: Multiplier for learning rate of all parameters. + - `decay_mult`: Multiplier for weight decay of all parameters. + - `bias_lr_mult`: Multiplier for learning rate of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. 
+ - `bias_decay_mult`: Multiplier for weight decay of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `norm_decay_mult`: Multiplier for weight decay of weigh and bias of normalization layers. Defaults to 1. + - `dwconv_decay_mult`: Multiplier for weight decay of depth-wise convolution layers. Defaults to 1. + - `bypass_duplicate`: Whether to bypass duplicated parameters. Defaults to `False`. + - `dcn_offset_lr_mult`: Multiplier for learning rate of deformable convolution layers. Defaults to 1. + +- **Set different hyper-parameter multipliers for specific parameters.** + + MMAction2 can use `custom_keys` in `paramwise_cfg` to specify different parameters to use different learning rates or weight decay. + + For example, to set all learning rates and weight decays of `backbone.layer0` to 0, the rest of `backbone` remains the same as the optimizer and the learning rate of `head` to 0.001, use the configs below. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### Gradient clipping + +During the training process, the loss function may get close to a cliffy region and cause gradient explosion. And gradient clipping is helpful to stabilize the training process. More introduction can be found in [this page](https://paperswithcode.com/method/gradient-clipping). + +Currently we support `clip_grad` option in `optim_wrapper` for gradient clipping, refers to [PyTorch Documentation](torch.nn.utils.clip_grad_norm_). + +Here is an example: + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: type of the used p-norm, here norm_type is 2. 
+ clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### Gradient accumulation + +When computing resources are lacking, the batch size can only be set to a small value, which may affect the performance of models. Gradient accumulation can be used to solve this problem. We support `accumulative_counts` option in `optim_wrapper` for gradient accumulation. + +Here is an example: + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +This indicates that during training, back-propagation is performed every 4 iters. And the above is equivalent to: + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## Customize parameter schedules + +In training, the optimization parameters such as learning rate and momentum are usually not fixed but change through iterations or epochs. PyTorch supports several learning rate schedulers, which are not sufficient for complex strategies. In MMAction2, we provide `param_scheduler` for better controls of different parameter schedules. + +### Customize learning rate schedules + +Learning rate schedulers are widely used to improve performance. We support most of the PyTorch schedulers, including `ExponentialLR`, `LinearLR`, `StepLR`, `MultiStepLR`, etc. + +All available learning rate schedulers can be found {external+mmengine:ref}`here `, and the +names of learning rate schedulers end with `LR`. + +- **Single learning rate schedule** + + In most cases, we use only one learning rate schedule for simplicity. For instance, [`MultiStepLR`](mmengine.optim.MultiStepLR) is used as the default learning rate schedule for ResNet. Here, `param_scheduler` is a dictionary. 
+ + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + Or, we want to use the [`CosineAnnealingLR`](mmengine.optim.CosineAnnealingLR) scheduler to decay the learning rate: + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **Multiple learning rate schedules** + + In some of the training cases, multiple learning rate schedules are applied for higher accuracy. For example ,in the early stage, training is easy to be volatile, and warmup is a technique to reduce volatility. + The learning rate will increase gradually from a minor value to the expected value by warmup and decay afterwards by other schedules. + + In MMAction2, simply combines desired schedules in `param_scheduler` as a list can achieve the warmup strategy. + + Here are some examples: + + 1. linear warmup during the first 50 iters. + + ```python + param_scheduler = [ + # linear warm-up by iters + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # by iters + end=50), # only warm up for first 50 iters + # main learing rate schedule + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. linear warmup and update lr by iter during the first 10 epochs. + + ```python + param_scheduler = [ + # linear warm-up by epochs in [0, 10) epochs + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # Update learning rate by iter. + ), + # use CosineAnnealing schedule after 10 epochs + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + Notice that, we use `begin` and `end` arguments here to assign the valid range, which is \[`begin`, `end`) for this schedule. And the range unit is defined by `by_epoch` argument. If not specified, the `begin` is 0 and the `end` is the max epochs or iterations. 
+ + If the ranges for all schedules are not continuous, the learning rate will stay constant in the ignored ranges, otherwise all valid schedulers will be executed in order in a specific stage, which behaves the same as PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler). + +### Customize momentum schedules + +We support using momentum schedulers to modify the optimizer's momentum according to learning rate, which could make the loss converge in a faster way. The usage is the same as learning rate schedulers. + +All available momentum schedulers can be found {external+mmengine:ref}`here `, and the +names of momentum schedulers end with `Momentum`. + +Here is an example: + +```python +param_scheduler = [ + # the lr scheduler + dict(type='LinearLR', ...), + # the momentum scheduler + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## Add new optimizers or constructors + +This part will modify the MMAction2 source code or add code to the MMAction2 framework; beginners can skip it. + +### Add new optimizers + +In academic research and industrial practice, it may be necessary to use optimization methods not implemented by MMAction2, and you can add them through the following methods. + +#### 1. Implement a new optimizer + +Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. +You need to create a new file under `mmaction/engine/optimizers`, and implement the new optimizer in the file, for example, in `mmaction/engine/optimizers/my_optimizer.py`: + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. Import the optimizer + +To find the module defined above, it should be imported during the running. 
First import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer maybe other class name + +__all__ = [..., 'MyOptimizer'] +``` + +During running, we will automatically import the `mmaction.engine` package and register the `MyOptimizer` at the same time. + +#### 3. Specify the optimizer in the config file + +Then you can use `MyOptimizer` in the `optim_wrapper.optimizer` field of config files. + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### Add new optimizer constructors + +Some models may have some parameter-specific settings for optimization, like different weight decay rate for all `BatchNorm` layers. + +Although we already can use [the `optim_wrapper.paramwise_cfg` field](#parameter-wise-finely-configuration) to +configure various parameter-specific optimizer settings. It may still not cover your need. + +Of course, you can modify it. By default, we use the [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) +class to deal with the construction of optimizer. And during the construction, it fine-grainedly configures the optimizer settings of +different parameters according to the `paramwise_cfg`,which could also serve as a template for new optimizer constructor. + +You can overwrite these behaviors by add new optimizer constructors. + +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +And then, import it and use it almost like [the optimizer tutorial](#add-new-optimizers). + +1. 
Import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. Use `MyOptimWrapperConstructor` in the `optim_wrapper.constructor` field of config files. + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000000000000000000000000000000000..ed33bbbb7682253cd005b9cf73b9127618107f87 --- /dev/null +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -0,0 +1,148 @@ +# Customize Data Pipeline + +In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations) for your tasks. + +- [Customize Data Pipeline](#customize-data-pipeline) + - [Design of Data Pipeline](#design-of-data-pipeline) + - [Modify the Training/Testing Pipeline](#modify-the-trainingtest-pipeline) + - [Loading](#loading) + - [Sampling Frames and Other Processing](#sampling-frames-and-other-processing) + - [Formatting](#formatting) + - [Add New Data Transforms](#add-new-data-transforms) + +## Design of Data Pipeline + +The data pipeline refers to the procedure of handling the data sample dict when indexing a sample from the dataset, and comprises a series of data transforms. Each data transform accepts a `dict` as input, processes it, and produces a `dict` as output for the subsequent data transform in the sequence. + +Below is an example data pipeline for training SlowFast on Kinetics using `VideoDataset`. The pipeline initially employs [`decord`](https://github.com/dmlc/decord) to read the raw videos and randomly sample one video clip, which comprises `32` frames with a frame interval of `2`. 
Subsequently, it applies random resized crop and random horizontal flip to all frames before formatting the data shape as `NCTHW`, which is `(1, 3, 32, 224, 224)` in this example. + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +A comprehensive list of all available data transforms in MMAction2 can be found in the [mmaction.datasets.transforms](mmaction.datasets.transforms). + +## Modify the Training/Testing Pipeline + +The data pipeline in MMAction2 is highly adaptable, as nearly every step of the data preprocessing can be configured from the config file. However, the wide array of options may be overwhelming for some users. + +Below are some general practices and guidance for building a data pipeline for action recognition tasks. + +### Loading + +At the beginning of a data pipeline, it is customary to load videos. However, if the frames have already been extracted, you should utilize `RawFrameDecode` and modify the dataset type to `RawframeDataset`. + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +If you need to load data from files with distinct formats (e.g., `pkl`, `bin`, etc.) or from specific locations, you may create a new loading transform and include it at the beginning of the data pipeline. 
Please refer to [Add New Data Transforms](#add-new-data-transforms) for more details. + +### Sampling Frames and Other Processing + +During training and testing, we may have different strategies to sample frames from the video. + +For instance, when testing SlowFast, we uniformly sample multiple clips as follows: + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +In the above example, 10 video clips, each comprising 32 frames, will be uniformly sampled from each video. `test_mode=True` is employed to accomplish this, as opposed to random sampling during training. + +Another example involves `TSN/TSM` models, which sample multiple segments from the video: + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +Typically, the data augmentations in the data pipeline handles only video-level transforms, such as resizing or cropping, but not transforms like video normalization or mixup/cutmix. This is because we can do video normalization and mixup/cutmix on batched video data +to accelerate processing using GPUs. To configure video normalization and mixup/cutmix, please use the [mmaction.models.utils.data_preprocessor](mmaction.models.utils.data_preprocessor). + +### Formatting + +Formatting involves collecting training data from the data information dict and converting it into a format that is compatible with the model. + +In most cases, you can simply employ [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs), and it will +convert the image in `NumPy Array` format to `PyTorch Tensor`, and pack the ground truth category information and +other meta information as a dict-like object [`ActionDataSample`](mmaction.structures.ActionDataSample). + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## Add New Data Transforms + +1. 
To create a new data transform, write a new transform class in a Python file named, for example, `my_transforms.py`. The data transform classes must inherit + the [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) class and override the `transform` method which takes a `dict` as input and returns a `dict`. Finally, place `my_transforms.py` in the folder `mmaction/datasets/transforms/`. + + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + def __init__(self, msg): + self.msg = msg + + def transform(self, results): + # Modify the data information dict `results`. + print(self.msg, 'MMAction2.') + return results + ``` + +2. Import the new class in the `mmaction/datasets/transforms/__init__.py`. + + ```python + ... + from .my_transforms import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. Use it in config files. + + ```python + train_pipeline = [ + ... + dict(type='MyTransform', msg='Hello!'), + ... + ] + ``` diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md new file mode 100644 index 0000000000000000000000000000000000000000..915d888ecddeca9fd635fa1b55db4f7f2d8aa938 --- /dev/null +++ b/docs/en/advanced_guides/dataflow.md @@ -0,0 +1,3 @@ +# Dataflow in MMAction2 + +coming soon... diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md new file mode 100644 index 0000000000000000000000000000000000000000..82fab764a856d26c5575a22f24743411b4e54a5f --- /dev/null +++ b/docs/en/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/en/api.rst b/docs/en/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3f688462bc92067c883eb4c61bc9246c271f659 --- /dev/null +++ b/docs/en/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. 
automodule:: mmaction.apis + :members: + +mmaction.datasets +----------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +--------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +--------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.recognizers + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. 
automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. automodule:: mmaction.utils + :members: diff --git a/docs/en/conf.py b/docs/en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4383705944ef0c34601904b31dcd8f39423954 --- /dev/null +++ b/docs/en/conf.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'MMAction2' +copyright = '2020, OpenMMLab' +author = 'MMAction2 Authors' +version_file = '../.././mmaction/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', + 'sphinx_tabs.tabs', + 'notfound.extension', + 'sphinxcontrib.jquery', +] + +# numpy and torch are required +autodoc_mock_imports = ['mmaction.version', 'PIL'] + +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'} + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+ +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme_options = { + # 'logo_url': 'https://mmaction2.readthedocs.io/en/latest/', + 'menu': [ + { + 'name': + 'Tutorial', + 'url': + 'https://colab.research.google.com/github/' + 'open-mmlab/mmaction2/blob/master/demo/mmaction2_tutorial.ipynb' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmaction2' + }, + { + 'name': + 'Upstream', + 'children': [{ + 'name': + 'MMCV', + 'url': + 'https://github.com/open-mmlab/mmcv', + 'description': + 'Foundational library for computer vision' + }, { + 'name': + 'MMPreTrain', + 'url': + 'https://github.com/open-mmlab/mmpretrain', + 'description': + 'Open source pre-training toolbox based on PyTorch' + }, { + 'name': + 'MMDetection', + 'url': + 'https://github.com/open-mmlab/mmdetection', + 'description': + 'Object detection toolbox and benchmark' + }, { + 'name': + 'MMPose', + 'url': + 'https://github.com/open-mmlab/mmpose', + 'description': + 'Open-source toolbox for pose estimation based on PyTorch.' 
+ }] + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +language = 'en' +master_doc = 'index' + +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +# The not found page +notfound_template = '404.html' + + +def builder_inited_handler(app): + if subprocess.run(['python', './stat.py']).returncode != 0: + raise RuntimeError('Failed to run the script `stat.py`.') + if subprocess.run(['python', './project_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `project_zoo.py`.') + if subprocess.run(['python', './dataset_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `dataset_zoo.py`.') + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..6980475ede264d1d8bf59434d0e18cf50b6a04b2 --- /dev/null +++ b/docs/en/dataset_zoo.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +import re +from pathlib import Path + +from utils import replace_link + +DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages. +MODELZOO_TEMPLATE = """\ +# Dataset Zoo Summary + +In this page, we list [all datasets](#all-supported-datasets) we support. You can click the link to jump to the corresponding dataset pages. 
+ +## All supported datasets + +* Number of datasets: {num_datasets} +{dataset_msg} + +""" # noqa: E501 + + +def generate_datasets_pages(): + dataset_list = Path('../../tools/data').glob('*/README.md') + num_datasets = 0 + dataset_msgs = [] + + for file in dataset_list: + num_datasets += 1 + + copy = DATASETS_ROOT / file.parent.with_suffix('.md').name + + with open(file, 'r') as f: + content = f.read() + + title = re.match(r'^# Preparing (.*)', content).group(1) + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + dataset_msgs.append(f'\t - [{title}]({copy})') + + with open(copy, 'w') as f: + f.write(content) + + dataset_msg = '\n'.join(dataset_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_datasets=num_datasets, + dataset_msg=dataset_msg, + ) + + with open('datasetzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +DATASETS_ROOT.mkdir(exist_ok=True) +generate_datasets_pages() diff --git a/docs/en/docutils.conf b/docs/en/docutils.conf new file mode 100644 index 0000000000000000000000000000000000000000..ddd79c377666db4a615151f0676f7fec32d38359 --- /dev/null +++ b/docs/en/docutils.conf @@ -0,0 +1,2 @@ +[html writers] +table_style: colwidths-auto diff --git a/docs/en/get_started/contribution_guide.md b/docs/en/get_started/contribution_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..452cc17cd2cff980e329a830c91aaa39d8c734ba --- /dev/null +++ b/docs/en/get_started/contribution_guide.md @@ -0,0 +1,61 @@ +# How to contribute to MMAction2 + +All kinds of contributions are welcome, including but not limited to the following. + +- Fixes (typo, bugs) +- New features and components +- Add documentation or translate the documentation into other languages +- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](../projectzoo.md) for details + +## Workflow + +1. 
Fork and pull the latest mmaction2 +2. Checkout a new branch with a meaningful name (do not use main branch for PRs) +3. Commit your changes +4. Create a PR + +```{note} +- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. +- If you are the author of some papers and would like to include your method to mmaction2, please contact us. We will much appreciate your contribution. +``` + +## Code style + +### Python + +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. + +We use the following tools for linting and formatting: + +- [flake8](http://flake8.pycqa.org/en/latest/): linter +- [yapf](https://github.com/google/yapf): formatter +- [isort](https://github.com/timothycrosley/isort): sort imports +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. +- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. + +Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/main/setup.cfg). + +We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/main/.pre-commit-config.yaml). + +After you clone the repository, you will need to install and initialize the pre-commit hook. 
+ +``` +pip install -U pre-commit +``` + +From the repository folder + +```shell +pre-commit install +``` + +After this on every commit check code linters and formatter will be enforced. + +> Before you create a PR, make sure that your code lints and is formatted by yapf. + +### C++ and CUDA + +We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..a0d4a84a5516ed50e25a494f03a7305f90fa7ad6 --- /dev/null +++ b/docs/en/get_started/faq.md @@ -0,0 +1,132 @@ +# FAQ + +## Outline + +We list some common issues faced by many users and their corresponding solutions here. + +- [FAQ](#faq) + - [Outline](#outline) + - [Installation](#installation) + - [Data](#data) + - [Training](#training) + - [Testing](#testing) + +Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. +If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md) and make sure to fill in all required information in the template. + +## Installation + +- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"** + + 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv` + 2. Install mmcv following the [installation instruction](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html#install-mmcv) + +- **"OSError: MoviePy Error: creation of None failed because of the following error"** + + Refer to [install.md](https://github.com/open-mmlab/mmaction2/blob/master/docs/install.md#requirements) + + 1. 
For Windows users, [ImageMagick](https://www.imagemagick.org/script/index.php) will not be automatically detected by MoviePy, there is a need to modify `moviepy/config_defaults.py` file by providing the path to the ImageMagick binary called `magick`, like `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"` + 2. For Linux users, there is a need to modify the `/etc/ImageMagick-6/policy.xml` file by commenting out the `<policy domain="path" rights="none" pattern="@*" />` line, if ImageMagick is not detected by moviepy. + +- **"Why I got the error message 'Please install XXCODEBASE to use XXX' even if I have already installed XXCODEBASE?"** + + You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html#installation) to install them. + +## Data + +- **FileNotFound like `No such file or directory: xxx/xxx/img_00300.jpg`** + + In our repo, we set `start_index=1` as default value for rawframe dataset, and `start_index=0` as default value for video dataset. + If users encounter FileNotFound error for the first or last frame of the data, there is a need to check the files begin with offset 0 or 1, + that is `xxx_00000.jpg` or `xxx_00001.jpg`, and then change the `start_index` value of data pipeline in configs. + +- **How should we preprocess the videos in the dataset? Resizing them to a fixed size (all videos with the same height-width ratio) like `340x256` (1) or resizing them so that the short edges of all videos are of the same length (256px or 320px) (2)** + + We have tried both preprocessing approaches and found (2) is a better solution in general, so we use (2) with short edge length 256px as the default preprocessing setting. 
We benchmarked these preprocessing approaches and you may find the results in [TSN Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn) and [SlowOnly Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly). + +- **Mismatched data pipeline items lead to errors like `KeyError: 'total_frames'`** + + We have both pipeline for processing videos and frames. + + **For videos**, We should decode them on the fly in the pipeline, so pairs like `DecordInit & DecordDecode`, `OpenCVInit & OpenCVDecode`, `PyAVInit & PyAVDecode` should be used for this case like [this example](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py#L14-L16). + + **For Frames**, the image has been decoded offline, so pipeline item `RawFrameDecode` should be used for this case like [this example](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py#L17). + + `KeyError: 'total_frames'` is caused by incorrectly using `RawFrameDecode` step for videos, since when the input is a video, it can not get the `total_frames` beforehand. + +## Training + +- **How to just use trained recognizer models for backbone pre-training?** + + In order to use the pre-trained model for the whole network, the new config adds the link of pre-trained models in the `load_from`. + + And to use backbone for pre-training, you can change `pretrained` value in the backbone dict of config files to the checkpoint path / url. + When training, the unexpected keys will be ignored. 
+ +- **How to fix stages of backbone when finetuning a model?** + + You can refer to [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L791) and [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L369-L370). + Reminding to set `find_unused_parameters = True` in config files for distributed training or testing. + + Actually, users can set `frozen_stages` to freeze stages in backbones except C3D model, since almost all backbones inheriting from `ResNet` and `ResNet3D` support the inner function `_freeze_stages()`. + +- **How to set memcached setting in config files?** + + In MMAction2, you can pass memcached kwargs to `class DecordInit` for video dataset or `RawFrameDecode` for rawframes dataset. + For more details, you can refer to \[`class FileClient`\] in MMEngine for more details. + Here is an example to use memcached for rawframes dataset: + + ```python + mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path') + + train_pipeline = [ + ... + dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg), + ... + ] + ``` + +- **How to set `load_from` value in config files to finetune models?** + + In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md), + users can directly change it by setting `load_from` in their configs. + +- **How to use `RawFrameDataset` for training?** + + In MMAction2 1.x version, most of the configs take `VideoDataset` as the default dataset type, which is much more friendly to file storage. If you want to use `RawFrameDataset` instead, there are two steps to modify: + + - Dataset: + modify dataset in `train_dataloader`/`val_dataloader`/`test_dataloader` from + + ``` + dataset=dict( + type=VideoDataset, + data_prefix=dict(video=xxx), + ...) 
+ ``` + + to + + ``` + dataset=dict( + type=RawFrameDataset, + data_prefix=dict(img=xxx), + filename_tmpl='{:05}.jpg', + ...) + ``` + + remaining fields of `dataset` don't need to be modified. Please make sure that `filename_tmpl` is matching with your frame data, and you can refer to [config document](../user_guides/config.md) for more details about config file. + + - Transforms: delete `dict(type='DecordInit', **file_client_args)`, modify `dict(type='DecordDecode')` to `dict(type='RawFrameDecode', **file_client_args)` in `train_pipeline`/`val_pipeline`/`test_pipeline`, and please make sure that `file_client_args = dict(io_backend='disk')` has been defined in your config. + + For more modifications about customizing datasets, please refer to [prepare dataset](../user_guides/prepare_dataset.md) and [customize dataset](../advanced_guides/customize_dataset.md). + +## Testing + +- **How to make predicted score normalized by softmax within \[0, 1\]?** + + change this in the config, make `model.cls_head.average_clips = 'prob'`. + +- **What if the model is too large and the GPU memory can not fit even only one testing sample?** + + By default, the 3d models are tested with 10clips x 3crops, which are 30 views in total. For extremely large models, the GPU memory can not fit even only one testing sample (cuz there are 30 views). To handle this, you can set `max_testing_views=n` in `model['test_cfg']` of the config file. If so, n views will be used as a batch during forwarding to save GPU memory used. diff --git a/docs/en/get_started/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md new file mode 100644 index 0000000000000000000000000000000000000000..790f3895a0557f8a60a38d332c6c8d73bc00862d --- /dev/null +++ b/docs/en/get_started/guide_to_framework.md @@ -0,0 +1,761 @@ +# A 20-Minute Guide to MMAction2 FrameWork + +In this tutorial, we will demonstrate the overall architecture of our `MMACTION2 1.0` through a step-by-step example of video action recognition. 
+ +The structure of this tutorial is as follows: + +- [A 20-Minute Guide to MMAction2 FrameWork](#a-20-minute-guide-to-mmaction2-framework) + - [Step0: Prepare Data](#step0-prepare-data) + - [Step1: Build a Pipeline](#step1-build-a-pipeline) + - [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) + - [Step3: Build a Recognizer](#step3-build-a-recognizer) + - [Step4: Build a Evaluation Metric](#step4-build-a-evaluation-metric) + - [Step5: Train and Test with Native PyTorch](#step5-train-and-test-with-native-pytorch) + - [Step6: Train and Test with MMEngine (Recommended)](#step6-train-and-test-with-mmengine-recommended) + +First, we need to initialize the `scope` for registry, to ensure that each module is registered under the scope of `mmaction`. For more detailed information about registry, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html). + +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## Step0: Prepare Data + +Please download our self-made [kinetics400_tiny](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset and extract it to the `$MMACTION2/data` directory. +The directory structure after extraction should be as follows: + +``` +mmaction2 +├── data +│ ├── kinetics400_tiny +│ │ ├── kinetics_tiny_train_video.txt +│ │ ├── kinetics_tiny_val_video.txt +│ │ ├── train +│ │ │ ├── 27_CSXByd3s.mp4 +│ │ │ ├── 34XczvTaRiI.mp4 +│ │ │ ├── A-wiliK50Zw.mp4 +│ │ │ ├── ... +│ │ └── val +│ │ ├── 0pVGiAU6XEA.mp4 +│ │ ├── AQrbRSnRt8M.mp4 +│ │ ├── ... 
+``` + +Here are some examples from the annotation file `kinetics_tiny_train_video.txt`: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +Each line in the file represents the annotation of a video, where the first item denotes the video filename (e.g., `D32_1gwq35E.mp4`), and the second item represents the corresponding label (e.g., label `0` for `D32_1gwq35E.mp4`). In this dataset, there are only `two` categories. + +## Step1: Build a Pipeline + +In order to `decode`, `sample`, `resize`, `crop`, `format`, and `pack` the input video and corresponding annotation, we need to design a pipeline to handle these processes. Specifically, we design seven `Transform` classes to build this video processing pipeline. Note that all `Transform` classes in OpenMMLab must inherit from the `BaseTransform` class in `mmcv`, implement the abstract method `transform`, and be registered to the `TRANSFORMS` registry. For more detailed information about data transform, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html). 
+ +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # Make the sampling during testing deterministic + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + 
results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample() + data_sample.set_gt_label(results['label']) + metainfo = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + packed_results['data_samples'] = data_sample + return packed_results +``` + +Below, we provide a code snippet (using `D32_1gwq35E.mp4 0` from the annotation file) to demonstrate how to use the pipeline. 
+ +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_label) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## Step2: Build a Dataset and DataLoader + +All `Dataset` classes in OpenMMLab must inherit from the `BaseDataset` class in `mmengine`. We can customize annotation loading process by overriding the `load_data_list` method. Additionally, we can add more information to the `results` dict that is passed as input to the `pipeline` by overriding the `get_data_info` method. For more detailed information about `BaseDataset` class, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). 
+ +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +Next, we will demonstrate how to use dataset and dataloader to index data. We will use the `Runner.build_dataloader` method to construct the dataloader. For more detailed information about dataloader, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader). 
+ +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_label) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = 
Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +The terminal output should be the same as the one shown in the [Step1: Build a Pipeline](#step1-build-a-pipeline). + +## Step3: Build a Recognizer + +Next, we will construct the `recognizer`, which mainly consists of three parts: `data preprocessor` for batching and normalizing the data, `backbone` for feature extraction, and `cls_head` for classification. + +The implementation of `data_preprocessor` is as follows: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # Batching + batch_inputs = (batch_inputs - self.mean) / self.std # Normalization + data['inputs'] = batch_inputs + return data +``` + +Here is the usage of data_preprocessor: feed the `batched_packed_results` obtained from the [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) into the `data_preprocessor` for batching and normalization. 
+ +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +The implementations of `backbone`, `cls_head` and `recognizer` are as follows: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', layer='Linear', std=0.01) + + super(ClsHeadZelda, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.in_channels = in_channels + self.average_clips = 
average_clips + + if dropout != 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + self.fc = nn.Linear(self.in_channels, self.num_classes) + self.pool = nn.AdaptiveAvgPool3d(1) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + N, C, T, H, W = x.shape + x = self.pool(x) + x = x.view(N, C) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores + + def loss(self, feats, data_samples): + cls_scores = self(feats) + labels = torch.stack([x.gt_label for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + loss_cls = self.loss_fn(cls_scores, labels) + return dict(loss_cls=loss_cls) + + def predict(self, feats, data_samples): + cls_scores = self(feats) + num_views = cls_scores.shape[0] // len(data_samples) + # assert num_views == data_samples[0].num_clips + cls_scores = self.average_clip(cls_scores, num_views) + + for ds, sc in zip(data_samples, cls_scores): + pred = LabelData(item=sc) + ds.pred_scores = pred + return data_samples + + def average_clip(self, cls_scores, num_views): + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. 
' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +The `init_cfg` is used for model weight initialization. For more information on model weight initialization, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html). 
The usage of the above modules is as follows: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + +# Train +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# Test +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_label) +print('Scores of Sample[0]', predictions[0].pred_score) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, 
mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## Step4: Build a Evaluation Metric + +Note that all `Metric` classes in `OpenMMLab` must inherit from the `BaseMetric` class in `mmengine` and implement the abstract methods, `process` and `compute_metrics`. For more information on evaluation, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html). 
+ +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_score'].cpu().numpy() + label = data_sample['gt_label'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## Step5: Train and Test with Native PyTorch + +```python +import torch.optim as optim +from mmengine import track_iter_progress + + +device = 'cuda' # or 'cpu' +max_epochs = 10 + +optimizer = optim.Adam(model.parameters(), lr=0.01) + +for epoch in range(max_epochs): + model.train() + losses = [] + for data_batch in track_iter_progress(train_data_loader): + data = model.data_preprocessor(data_batch, training=True) + loss_dict = model(**data, mode='loss') + loss = loss_dict['loss_cls'] + + optimizer.zero_grad() + loss.backward() + 
optimizer.step() + + losses.append(loss.item()) + + print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader)) + + with torch.no_grad(): + model.eval() + for data_batch in track_iter_progress(val_data_loader): + data = model.data_preprocessor(data_batch, training=False) + predictions = model(**data, mode='predict') + data_samples = [d.to_dict() for d in predictions] + metric.process(data_batch, data_samples) + + acc = metric.acc = metric.compute_metrics(metric.results) + for name, topk in acc.items(): + print(f'{name}: ', topk) +``` + +## Step6: Train and Test with MMEngine (Recommended) + +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). + +```python +from mmengine.runner import Runner + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') + +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01)) + +runner = Runner(model=model_cfg, work_dir='./work_dirs/guide', + train_dataloader=train_dataloader_cfg, + train_cfg=train_cfg, + val_dataloader=val_dataloader_cfg, + val_cfg=val_cfg, + optim_wrapper=optim_wrapper, + val_evaluator=[metric_cfg], + default_scope='mmaction') +runner.train() +``` diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..dc2153c1e68b695f63451a218d17d9300133c0f7 --- /dev/null +++ b/docs/en/get_started/installation.md @@ -0,0 +1,209 @@ +# Installation + +## Prerequisites + +In this section we demonstrate how to prepare an environment with PyTorch. + +MMAction2 works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 10.2+ and PyTorch 1.8+. 
+ +```{note} +If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation. +``` + +**Step 1.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html). + +**Step 2.** Create a conda environment and activate it. + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**Step 3.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g. + +On GPU platforms: + +```shell +conda install pytorch torchvision -c pytorch +``` + +```{warning} +This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. +``` + +On CPU platforms: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## Best Practices + +We recommend that users follow our best practices to install MMAction2. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. + +**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional) and [MMPose](https://github.com/open-mmlab/mmpose) (optional) using [MIM](https://github.com/open-mmlab/mim). + +```shell +pip install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet +mim install mmpose +``` + +**Step 2.** Install MMAction2. + +According to your needs, we support two install modes: + +- [Install from source (Recommended)](#build-mmaction2-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. 
+- [Install as a Python package](#install-as-a-python-package): You just want to call MMAction2's APIs or import MMAction2's modules in your project. + +### Build MMAction2 from source + +In this case, install mmaction2 from source: + +```shell +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without re-installation. +``` + +Optionally, if you want to contribute to MMAction2 or experience experimental functions, please checkout to the `dev-1.x` branch: + +```shell +git checkout dev-1.x +``` + +### Install as a Python package + +Just install with pip. + +```shell +pip install mmaction2 +``` + +## Verify the installation + +To verify whether MMAction2 is installed correctly, we provide some sample codes to run an inference demo. + +**Step 1.** Download the config and checkpoint files. + +```shell +mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest . +``` + +**Step 2.** Verify the inference demo. + +Option (a). If you install mmaction2 from source, you can run the following command: + +```shell +# The demo.mp4 and label_map_k400.txt are both from Kinetics-400 +python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \ + tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt +``` + +You will see the top-5 labels with corresponding scores in your terminal. + +Option (b). 
If you install mmaction2 as a python package, you can run the following codes in your python interpreter, which will do the similar verification: + +```python +from operator import itemgetter +from mmaction.apis import init_recognizer, inference_recognizer + +config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' +video_file = 'demo/demo.mp4' +label_file = 'tools/data/kinetics/label_map_k400.txt' +model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +pred_result = inference_recognizer(model, video_file) + +pred_scores = pred_result.pred_score.tolist() +score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) +score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) +top5_label = score_sorted[:5] + +labels = open(label_file).readlines() +labels = [x.strip() for x in labels] +results = [(labels[k[0]], k[1]) for k in top5_label] + +print('The top-5 labels with corresponding scores are:') +for result in results: + print(f'{result[0]}: ', result[1]) +``` + +## Customize Installation + +### CUDA versions + +When installing PyTorch, you may need to specify the version of CUDA. If you are +not clear on which to choose, follow our recommendations: + +- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must. +- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight. + +Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information. + +```{note} +Installing CUDA runtime libraries is enough if you follow our best practices, +because no CUDA code will be compiled locally. 
However if you hope to compile +MMCV from source or develop other CUDA operators, you need to install the +complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), +and its version should match the CUDA version of PyTorch. i.e., the specified +version of cudatoolkit in `conda install` command. +``` + +### Install MMCV without MIM + +MMCV contains C++ and CUDA extensions, so it depends on PyTorch in a complex +way. MIM solves such dependencies automatically and makes the installation +easier. However, it is not a must. + +To install MMCV with pip instead of MIM, please follow +[MMCV installation guides](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). +This requires manually specifying a find-url based on PyTorch version and its CUDA version. + +For example, the following command install mmcv built for PyTorch 1.10.x and CUDA 11.3. + +```shell +pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +``` + +### Install on CPU-only platforms + +MMAction2 can be built for CPU-only environment. In CPU mode you can train, test or inference a model. + +Some functionalities are gone in this mode, usually GPU-compiled ops. But don't +worry, almost all models in MMAction2 don't depend on these ops. + +### Using MMAction2 with Docker + +We provide a [Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile) +to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03. + +```shell +# build an image with PyTorch 1.8.1, CUDA 10.2, CUDNN 7. +# If you prefer other versions, just modified the Dockerfile +docker build -f ./docker/Dockerfile --rm -t mmaction2 . +``` + +Run it with + +```shell +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 +``` + +## Troubleshooting + +1. 
When migrating from the old version `0.x` to the new version `1.x`, you may encounter issues with mismatched versions of dependent libraries. Below is a display of the versions of each dependent library after following the aforementioned installation process, as shown by `pip list` command. Please ensure that the versions of each dependent library displayed in your terminal are greater than or equal to (i.e., `>=`) the versions shown below for each dependent library. + +```shell +mmaction2 1.0.0 +mmcv 2.0.0 +mmdet 3.0.0 +mmengine 0.7.2 +mmpose 1.0.0 +``` diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..64f0d523eb4c886282bccc5196cf7c696a806f43 --- /dev/null +++ b/docs/en/get_started/overview.md @@ -0,0 +1,97 @@ +# Overview + +## What is MMAction2 + +MMAction2 is an open source toolkit based on PyTorch, supporting numerous video understanding models, including **action recognition, skeleton-based action recognition, spatio-temporal action detection and temporal action localization**. Moreover, it supports widely-used academic datasets and offers many useful tools, assisting users in exploring various aspects of models and datasets, as well as implementing high-quality algorithms. Generally, the toolkit boasts the following features: + +**One-stop, Multi-model**: MMAction2 supports various video understanding tasks and implements state-of-the-art models for action recognition, localization, detection. + +**Modular Design**: The modular design of MMAction2 enables users to define and reuse modules in the model as required. + +**Various Useful Tools**: MMAction2 provides an array of analysis tools, such as visualizers, validation scripts, evaluators, etc., to aid users in troubleshooting, fine-tuning, or comparing models. 
+ +**Powered by OpenMMLab**: Similar to other algorithm libraries in the OpenMMLab family, MMAction2 adheres to OpenMMLab's rigorous development guidelines and interface conventions, considerably reducing the learning cost for users familiar with other OpenMMLab projects. Furthermore, due to the unified interfaces among OpenMMLab projects, it is easy to call models implemented in other OpenMMLab projects (such as MMClassification) in MMAction2, which greatly facilitates cross-domain research and real-world applications. + + + + +
+

Action Recognition


+

Skeleton-based Action Recognition

+ + + +
+

Spatio-Temporal Action Detection


+

 Temporal Action Localization

+ +## How to use the documentation + +We have prepared a wealth of documents to meet your various needs: + +
+For the basic usage of MMAction2 + +- [Installation](installation.md) +- [Quick Run](quick_run.md) +- [Inference with existing models](../user_guides/inference.md) + +
+ +
+For training on supported dataset + +- [Learn about Configs](../user_guides/config.md) +- [Prepare Dataset](../user_guides/prepare_dataset.md) +- [Training and Test](../user_guides/train_test.md) + +
+ +
+For looking for some common issues + +- [FAQ](faq.md) +- [Useful tools](../useful_tools.md) + +
+ +
+For a general understanding about MMAction2 + +- [A 20-Minute Guide to MMAction2 FrameWork](guide_to_framework.md) +- [Dataflow in MMAction2](../advanced_guides/dataflow.md) + +
+ +
+For advanced usage about custom training + +- [Customize Model](../advanced_guides/customize_models.md) +- [Customize Dataset](../advanced_guides/customize_dataset.md) +- [Customize Data Pipeline](../advanced_guides/customize_pipeline.md) +- [Customize Optimizer](../advanced_guides/customize_optimizer.md) +- [Customize Logging](../advanced_guides/customize_logging.md) + +
+ +
+For supported model zoo and dataset zoo + +- [Model Zoo](../modelzoo_statistics.md) +- [Dataset Zoo](../datasetzoo_statistics.md) + +
+ +
+For migration from MMAction2 0.x + +- [Migration](../migration.md) + +
+ +
+For researchers and developers who are willing to contribute to MMAction2 + +- [How to contribute to MMAction2](contribution_guide.md) + +
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md new file mode 100644 index 0000000000000000000000000000000000000000..63c7f2429eaf8afff17ddde530110d07a079ec40 --- /dev/null +++ b/docs/en/get_started/quick_run.md @@ -0,0 +1,219 @@ +# Quick Run + +This chapter will introduce you to the fundamental functionalities of MMAction2. We assume that you have [installed MMAction2 from source](installation.md#best-practices). + +- [Quick Run](#quick-run) + - [Inference](#inference) + - [Prepare a Dataset](#prepare-a-dataset) + - [Modify the Config](#modify-the-config) + - [Modify Dataset](#modify-dataset) + - [Modify Runtime Config](#modify-runtime-config) + - [Modify Model Config](#modify-model-config) + - [Browse the Dataset](#browse-the-dataset) + - [Training](#training) + - [Testing](#testing) + +## Inference + +Run the following command in the root directory of MMAction2: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +You should be able to see a pop-up video and the inference result printed out in the console. + +
+ +
+
+ +```bash +# Inference result +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +If you are running MMAction2 on a server without a GUI or via an SSH tunnel with X11 forwarding disabled, you may not see the pop-up window. +``` + +A detailed description of MMAction2's inference interface can be found [here](https://github.com/open-mmlab/mmaction2/tree/main/demo/README.md#inferencer). + +In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example. + +## Prepare a Dataset + +Since the variety of video dataset formats are not conducive to switching datasets, MMAction2 proposes a uniform [data format](../user_guides/2_data_prepare.md), and provides [dataset preparer](../user_guides/data_prepare/dataset_preparer.md) for commonly used video datasets. Usually, to use those datasets in MMAction2, you just need to follow the steps to get them ready for use. + +```{note} +But here, efficiency means everything. +``` + +To get started, please download our pre-prepared [kinetics400_tiny.zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) and extract it to the `data/` directory in the root directory of MMAction2. This will provide you with the necessary videos and annotation file. + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## Modify the Config + +After preparing the dataset, the next step is to modify the config file to specify the location of the training set and training parameters. + +In this example, we will train a TSN using resnet50 as its backbone. 
Since MMAction2 already has a config file for the full Kinetics400 dataset (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`), we just need to make some modifications on top of it. + +### Modify Dataset + +We first need to modify the path to the dataset. Open `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` and replace keys as followed: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### Modify Runtime Config + +Additionally, due to the reduced size of the dataset, we recommend decreasing the training batch size to 4 and the number of training epochs to 10 accordingly. Furthermore, we suggest shortening the validation and weight storage intervals to 1 round each, and modifying the learning rate decay strategy. Modify the corresponding keys in `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as following lines to take effect. + +```python +# set training batch size to 4 +train_dataloader['batch_size'] = 4 + +# Save checkpoints every epoch, and only keep the latest checkpoint +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1)) +# Set the maximum number of epochs to 10, and validate the model every 1 epochs +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +# adjust learning rate schedule according to 10 epochs +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] +``` + +### Modify Model Config + +Further, due to the small size of tiny Kinetics dataset, it is recommended to load a pre-trained model on the original Kinetics dataset. 
Additionally, the model needs to be modified according to the actual number of classes. Please directly add the following lines to `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`. + +```python +model = dict( + cls_head=dict(num_classes=2)) +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' +``` + +Here, we have rewritten the corresponding parameters in the base configuration directly through the inheritance ({external+mmengine:doc}`MMEngine: Config `) mechanism of the config. The original fields are distributed in `configs/_base_/models/tsn_r50.py`, `configs/_base_/schedules/sgd_100e.py` and `configs/_base_/default_runtime.py`. + +```{note} +For a more detailed description of config, please refer to [here](../user_guides/1_config.md). +``` + +## Browse the Dataset + +Before we start the training, we can also visualize the frames processed by training-time data transforms. It's quite simple: pass the config file we need to visualize into the [browse_dataset.py](https://github.com/open-mmlab/mmaction2/tree/main/tools/analysis_tools/browse_dataset.py) script. + +```Bash +python tools/visualizations/browse_dataset.py \ + configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + browse_out --mode pipeline +``` + +The transformed videos will be saved to `browse_out` folder. + +
+ +
+ +```{note} +For details on the parameters and usage of this script, please refer to [here](../user_guides/useful_tools.md). +``` + +```{tip} +In addition to satisfying our curiosity, visualization can also help us check the parts that may affect the model's performance before training, such as problems in configs, datasets and data transforms. +``` + +we can further visualize the learning rate schedule to make sure that the config is as expected by following script: + +```Bash +python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +The training learning rate schedule will be displayed in a pop-up window. + +
+ +
+ +```{note} +The learning rate is auto scaled according to the actual batchsize. +``` + +## Training + +Start the training by running the following command: + +```Bash +python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +Depending on the system environment, MMAction2 will automatically use the best device for training. If a GPU is available, a single GPU training will be started by default. When you start to see the output of the losses, you have successfully started the training. + +```Bash +03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267 +03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130 +03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900 +03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs +03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658 +03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth. 
+``` + +Without extra configurations, model weights will be saved to `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`, while the logs will be stored in `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/TIMESTAMP/`. Next, we just need to wait with some patience for training to finish. + +```{note} +For advanced usage of training, such as CPU training, multi-GPU training, and cluster training, please refer to [Training and Testing](../user_guides/train_test.md). +``` + +## Testing + +After 10 epochs, we observe that TSN performs best in the 6th epoch, with `acc/top1` reaching 1.0000: + +```Bash +03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091 +``` + +```{note} +The result is pretty high due to pre-training on the original Kinetics400; you may see a different result. +``` + +However, this value only reflects the validation performance of TSN on the mini Kinetics dataset, while test results are usually higher due to more augmentation in the test pipeline. + +Start testing: + +```Bash +python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth +``` + +And get the outputs like: + +```Bash +03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795 +``` + +The model achieves a top1-accuracy of 1.0000 on this dataset. + +```{note} +For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md). 
+``` diff --git a/docs/en/index.rst b/docs/en/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ad136df520d82eedb6e257bc996738c3285b7c5f --- /dev/null +++ b/docs/en/index.rst @@ -0,0 +1,96 @@ +Welcome to MMAction2's documentation! +===================================== + +You can switch between Chinese and English documents in the lower-left corner of the layout. + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md + +.. toctree:: + :maxdepth: 1 + :caption: User Guides + + user_guides/inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md + user_guides/finetune.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Guides + + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + useful_tools.md + +.. toctree:: + :maxdepth: 1 + :caption: Model Zoo + + modelzoo_statistics.md + model_zoo/recognition.md + model_zoo/recognition_audio.md + model_zoo/skeleton.md + model_zoo/detection.md + model_zoo/retrieval.md + model_zoo/localization.md + +.. toctree:: + :maxdepth: 1 + :caption: Dataset Zoo + :glob: + + datasetzoo_statistics.md + dataset_zoo/* + +.. toctree:: + :maxdepth: 1 + :caption: Projects + + projectzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Migration + + migration.md + +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: Notes + + notes/ecosystem.md + notes/changelog.md + +.. 
toctree:: + :caption: Switch Language + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/en/make.bat b/docs/en/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..2119f51099bf37e4fdb6071dce9f451ea44c62dd --- /dev/null +++ b/docs/en/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/en/migration.md b/docs/en/migration.md new file mode 100644 index 0000000000000000000000000000000000000000..ea1935169dda03121d5fb2a8ffb2687cdd504e8c --- /dev/null +++ b/docs/en/migration.md @@ -0,0 +1,498 @@ +# Migration from MMAction2 0.x + +MMAction2 1.x introduced major refactorings and modifications including some BC-breaking changes. We provide this tutorial to help you migrate your projects from MMAction2 0.x smoothly. + +## New dependencies + +MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started/installation.md) + +1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is a foundational library for training deep learning model introduced in OpenMMLab 2.0 architecture. +2. 
[MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0` which is more compact and efficient than `mmcv-full==2.0.0`. + +## Configuration files + +In MMAction2 1.x, we refactored the structure of configuration files. The configuration files with the old style will be incompatible. + +In this section, we will introduce all changes of the configuration files. And we assume you are already familiar with the [config files](./user_guides/config.md). + +### Model settings + +No changes in `model.backbone` and `model.neck`. For `model.cls_head`, we move the `average_clips` inside it, which is originally set in `model.test_cfg`. + +### Data settings + +#### Changes in **`data`** + +- The original `data` field is splited to `train_dataloader`, `val_dataloader` and + `test_dataloader`. This allows us to configure them in fine-grained. For example, + you can specify different sampler and batch size during training and test. +- The `videos_per_gpu` is renamed to `batch_size`. +- The `workers_per_gpu` is renamed to `num_workers`. + + + + + + + + + +
Original + +```python +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict(...), + val=dict(...), + test=dict(...), +) +``` + +
New + +```python +train_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=True) # necessary +) + +val_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=False) # necessary +) + +test_dataloader = val_dataloader +``` + +
+ +#### Changes in **`pipeline`** + +- The original formatting transforms **`ToTensor`**, **`Collect`** are combined as `PackActionInputs`. +- We don't recommend to do **`Normalize`** in the dataset pipeline. Please remove it from pipelines and set it in the `model.data_preprocessor` field. + + + + + + + + + +
Original + +```python + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +``` + +
New + +```python +model.data_preprocessor = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +``` + +
+ +#### Changes in **`evaluation`** + +- The **`evaluation`** field is splited to `val_evaluator` and `test_evaluator`. And it won't support `interval` and `save_best` arguments. +- The `interval` is moved to `train_cfg.val_interval` and the `save_best` is moved to `default_hooks.checkpoint.save_best`. +- The 'mean_average_precision', 'mean_class_accuracy', 'mmit_mean_average_precision', 'top_k_accuracy' are combined as `AccMetric`, and you could use `metric_list` to specify which metric to calculate. +- The `AVAMetric` is used to evaluate AVA Dataset. +- The `ANetMetric` is used to evaluate ActivityNet Dataset. + + + + + + + + + +
Original + +```python +evaluation = dict( + interval=5, + metrics=['top_k_accuracy', 'mean_class_accuracy']) +``` + +
New + +```python +val_evaluator = dict( + type='AccMetric', + metric_list=('top_k_accuracy', 'mean_class_accuracy')) +test_evaluator = val_evaluator +``` + +
+ +### Schedule settings + +#### Changes in **`optimizer`** and **`optimizer_config`** + +- Now we use `optim_wrapper` field to configure the optimization process. And the + `optimizer` becomes a sub field of `optim_wrapper`. +- `paramwise_cfg` is also a sub field of `optim_wrapper` parallel to `optimizer`. +- `optimizer_config` is removed now, and all configurations of it are moved to `optim_wrapper`. +- `grad_clip` is renamed to `clip_grad`. + + + + + + + + + +
Original + +```python +optimizer = dict( + type='AdamW', + lr=0.0015, + weight_decay=0.3, + paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + )) + +optimizer_config = dict(grad_clip=dict(max_norm=1.0)) +``` + +
New + +```python +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3), + paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + ), + clip_gard=dict(max_norm=1.0), +) +``` + +
+ +#### Changes in **`lr_config`** + +- The `lr_config` field is removed and we use new `param_scheduler` to replace it. +- The `warmup` related arguments are removed, since we use schedulers combination to implement this + functionality. + +The new schedulers combination mechanism is very flexible, and you can use it to design many kinds of learning +rate / momentum curves. + + + + + + + + + +
Original + +```python +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_iters=5, + warmup_ratio=0.01, + warmup_by_epoch=True) +``` + +
New + +```python +param_scheduler = [ + # warmup + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + end=5, + # Update the learning rate after every iters. + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', by_epoch=True, begin=5), +] +``` + +
+ +#### Changes in **`runner`** + +Most configuration in the original `runner` field is moved to `train_cfg`, `val_cfg` and `test_cfg`, which +configure the loop in training, validation and test. + + + + + + + + + +
Original + +```python +runner = dict(type='EpochBasedRunner', max_epochs=100) +``` + +
New + +```python +# The `val_interval` is the original `evaluation.interval`. +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') # Use the default validation loop. +test_cfg = dict(type='TestLoop') # Use the default test loop. +``` + +
+ +In fact, in OpenMMLab 2.0, we introduced `Loop` to control the behaviors in training, validation and test. And +the functionalities of `Runner` are also changed. You can find more details in the [MMEngine tutorials](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). + +### Runtime settings + +#### Changes in **`checkpoint_config`** and **`log_config`** + +The `checkpoint_config` are moved to `default_hooks.checkpoint` and the `log_config` are moved to `default_hooks.logger`. +And we move many hooks settings from the script code to the `default_hooks` field in the runtime configuration. + +```python +default_hooks = dict( + # update runtime information, e.g. current iter and lr. + runtime_info=dict(type='RuntimeInfoHook'), + + # record the time of every iterations. + timer=dict(type='IterTimerHook'), + + # print log every 100 iterations. + logger=dict(type='LoggerHook', interval=100), + + # enable the parameter scheduler. + param_scheduler=dict(type='ParamSchedulerHook'), + + # save checkpoint per epoch, and automatically save the best checkpoint. + checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'), + + # set sampler seed in distributed environment. + sampler_seed=dict(type='DistSamplerSeedHook'), + + # synchronize model buffers at the end of each epoch. + sync_buffers=dict(type='SyncBuffersHook') +) +``` + +In addition, we splited the original logger to logger and visualizer. The logger is used to record +information and the visualizer is used to show the logger in different backends, like terminal, TensorBoard +and Wandb. + + + + + + + + + +
Original + +```python +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) +``` + +
New + +```python +default_hooks = dict( + ... + logger=dict(type='LoggerHook', interval=100), +) + +visualizer = dict( + type='ActionVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], +) +``` + +
+ +#### Changes in **`load_from`** and **`resume_from`** + +- The `resume_from` is removed. And we use `resume` and `load_from` to replace it. + - If `resume=True` and `load_from` is not None, resume training from the checkpoint in `load_from`. + - If `resume=True` and `load_from` is None, try to resume from the latest checkpoint in the work directory. + - If `resume=False` and `load_from` is not None, only load the checkpoint, not resume training. + - If `resume=False` and `load_from` is None, do not load nor resume. + +#### Changes in **`dist_params`** + +The `dist_params` field is a sub field of `env_cfg` now. And there are some new configurations in the `env_cfg`. + +```python +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) +``` + +#### Changes in **`workflow`** + +`Workflow` related functionalities are removed. + +#### New field **`visualizer`** + +The visualizer is a new design in OpenMMLab 2.0 architecture. We use a visualizer instance in the runner to handle results & log visualization and save to different backends. + +```python +visualizer = dict( + type='ActionVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + # Uncomment the below line to save the log and visualization results to TensorBoard. + # dict(type='TensorboardVisBackend') + ] +) +``` + +#### New field **`default_scope`** + +The start point to search module for all registries. The `default_scope` in MMAction2 is `mmaction`. See [the registry tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/registry.html) for more details. + +## Packages + +### `mmaction.apis` + +The documentation can be found [here](mmaction.apis). 
+ +| Function | Changes | +| :--------------------: | :---------------------------------------------: | +| `init_recognizer` | No changes | +| `inference_recognizer` | No changes | +| `train_model` | Removed, use `runner.train` to train. | +| `multi_gpu_test` | Removed, use `runner.test` to test. | +| `single_gpu_test` | Removed, use `runner.test` to test. | +| `set_random_seed` | Removed, use `mmengine.runner.set_random_seed`. | +| `init_random_seed` | Removed, use `mmengine.dist.sync_random_seed`. | + +### `mmaction.core` + +The `mmaction.core` package is renamed to [`mmaction.engine`](mmaction.engine). + +| Sub package | Changes | +| :----------: | :-------------------------------------------------------------------------------------------------: | +| `evaluation` | Removed, use the metrics in `mmaction.evaluation`. | +| `hooks` | Moved to `mmaction.engine.hooks` | +| `optimizer` | Moved to `mmaction.engine.optimizers` | +| `utils` | Removed, the distributed environment related functions can be found in the `mmengine.dist` package. | + +### `mmaction.datasets` + +The documentation can be found [here](mmaction.datasets) + +#### Changes in [`BaseActionDataset`](mmaction.datasets.BaseActionDataset): + +| Method | Changes | +| :--------------------: | :-------------------------------------------: | +| `prepare_train_frames` | Replaced by `get_data_info` | +| `prepare_test_frames` | Replaced by `get_data_info` | +| `evaluate` | Removed, use `mmengine.evaluator.Evaluator` | +| `dump_results` | Removed, use `mmengine.evaluator.DumpResults` | +| `load_annotations` | Replaced by `load_data_list` | + +Now, you can write a new Dataset class inherited from `BaseActionDataset` and overwrite `load_data_list` only. To load more data information, you could overwrite `get_data_info` like `RawframeDataset` and `AVADataset`. 
+The `mmaction.datasets.pipelines` is renamed to `mmaction.datasets.transforms` and the `mmaction.datasets.pipelines.augmentations` is renamed to `mmaction.datasets.transforms.processing`. + +### `mmaction.models` + +The documentation can be found [here](mmaction.models). The interface of all **backbones**, **necks** and **losses** didn't change. + +#### Changes in [`BaseRecognizer`](mmaction.models.BaseRecognizer): + +| Method | Changes | +| :-------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| `extract_feat` | Enhanced method, which now supports output features of three stages (`backbone`, `neck`, `head`) and can handle different modes, such as `train_mode` and `test_mode`. | +| `forward` | Now only accepts three arguments: `inputs`, `data_samples` and `mode`. See [the documentation](mmaction.models.BaseRecognizer) for more details. | +| `forward_train` | Replaced by `loss`. | +| `forward_test` | Replaced by `predict`. | +| `train_step` | The `optimizer` argument is replaced by `optim_wrapper` and it accepts [`OptimWrapper`](mmengine.optim.OptimWrapper). | +| `val_step` | The original `val_step` is the same as `train_step`, now it calls `predict`. | +| `test_step` | New method, and it's the same as `val_step`. | + +#### Changes in [BaseHead](mmaction.models.BaseHead): + +| Method | Changes | +| :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| `forward` | No changes | +| `loss` | It accepts `feats` and `data_samples` instead of `cls_score` and `labels` to calculate loss. The `data_samples` is a list of [ActionDataSample](mmaction.structures.ActionDataSample). | +| `predict` | New method. It accepts `feats` and `data_samples` to predict classification scores. 
| + +### `mmaction.utils` + +| Function | Changes | +| :---------------------: | :-----------------------------------------------------------: | +| `collect_env` | No changes | +| `get_root_logger` | Removed, use `mmengine.MMLogger.get_current_instance` | +| `setup_multi_processes` | Removed, use `mmengine.utils.dl_utils.setup_multi_processes`. | + +### Other changes + +- We moved the definition of all registries in different packages to the `mmaction.registry` package. diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md new file mode 100644 index 0000000000000000000000000000000000000000..04acb16e91ee21086d2ba7dda4b3623c28a9ad83 --- /dev/null +++ b/docs/en/notes/changelog.md @@ -0,0 +1,1082 @@ +# Changelog + +## 1.2.0 (10/12/2023) + +**Highlights** + +- Support the Training of ActionClip +- Support VindLU multi-modality algorithm +- Support MobileOne TSN/TSM + +**New Features** + +- Support the Training of ActionClip ([2620](https://github.com/open-mmlab/mmaction2/pull/2620)) +- Support video retrieval dataset MSVD ([2622](https://github.com/open-mmlab/mmaction2/pull/2622)) +- Support VindLU multi-modality algorithm ([2667](https://github.com/open-mmlab/mmaction2/pull/2667)) +- Support Dense Regression Network for Video Grounding ([2668](https://github.com/open-mmlab/mmaction2/pull/2668)) + +**Improvements** + +- Support Video Demos ([2602](https://github.com/open-mmlab/mmaction2/pull/2602)) +- Support Audio Demos ([2603](https://github.com/open-mmlab/mmaction2/pull/2603)) +- Add README_zh-CN.md for Swin and VideoMAE ([2621](https://github.com/open-mmlab/mmaction2/pull/2621)) +- Support MobileOne TSN/TSM ([2656](https://github.com/open-mmlab/mmaction2/pull/2656)) +- Support SlowOnly K700 feature to train localization models ([2673](https://github.com/open-mmlab/mmaction2/pull/2673)) + +**Bug Fixes** + +- Refine ActionDataSample structure ([2658](https://github.com/open-mmlab/mmaction2/pull/2658)) +- Fix MPS device 
([2619](https://github.com/open-mmlab/mmaction2/pull/2619)) + +## 1.1.0 (7/3/2023) + +**Highlights** + +- Support HACS-segments dataset(ICCV'2019), MultiSports dataset(ICCV'2021), Kinetics-710 dataset(Arxiv'2022) +- Support rich projects: gesture recognition, spatio-temporal action detection tutorial, and knowledge distillation +- Support TCANet(CVPR'2021) +- Support VideoMAE V2(CVPR'2023), and VideoMAE(NeurIPS'2022) on action detection +- Support CLIP-based multi-modality models: ActionCLIP(Arxiv'2021) and CLIP4clip(ArXiv'2022) +- Support [Pure Python style Configuration File](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and downloading datasets by MIM + +**New Features** + +- Support HACS-segments dataset ([2224](https://github.com/open-mmlab/mmaction2/pull/2224)) +- Support TCANet ([2271](https://github.com/open-mmlab/mmaction2/pull/2271)) +- Support MultiSports dataset ([2280](https://github.com/open-mmlab/mmaction2/pull/2280)) +- Support spatio-temporal action detection tutorial ([2428](https://github.com/open-mmlab/mmaction2/pull/2428)) +- Support knowledge distillation based on MMRazor ([2458](https://github.com/open-mmlab/mmaction2/pull/2458)) +- Support VideoMAE V2 ([2460](https://github.com/open-mmlab/mmaction2/pull/2460)) +- Support ActionCLIP ([2470](https://github.com/open-mmlab/mmaction2/pull/2470)) +- Support CLIP4clip ([2489](https://github.com/open-mmlab/mmaction2/pull/2489)) +- Support Kinetics-710 dataset ([2534](https://github.com/open-mmlab/mmaction2/pull/2534)) +- Support gesture recognition project ([2539](https://github.com/open-mmlab/mmaction2/pull/2539)) +- Support VideoMAE on action detection ([2547](https://github.com/open-mmlab/mmaction2/pull/2547)) +- Support downloading datasets by MIM ([2465](https://github.com/open-mmlab/mmaction2/pull/2465)) +- Support new config ([2542](https://github.com/open-mmlab/mmaction2/pull/2542)) + +**Improvements** + +- Refactor TSM 
init_weights ([2396](https://github.com/open-mmlab/mmaction2/pull/2396)) +- Add unit test for Recognizer 2D ([2432](https://github.com/open-mmlab/mmaction2/pull/2432)) +- Enhance inference APIs ([2472](https://github.com/open-mmlab/mmaction2/pull/2472)) +- Support converting ST-GCN and PoseC3D to ONNX ([2543](https://github.com/open-mmlab/mmaction2/pull/2543)) +- Support feature extraction head ([2525](https://github.com/open-mmlab/mmaction2/pull/2525)) + +**Bug Fixes** + +- Fix CircleCI ([2351](https://github.com/open-mmlab/mmaction2/pull/2351)) +- Fix demo skeleton script ([2380](https://github.com/open-mmlab/mmaction2/pull/2380)) +- Fix docker file branch ([2397](https://github.com/open-mmlab/mmaction2/pull/2397)) +- Fix NTU pose extraction script ([2402](https://github.com/open-mmlab/mmaction2/pull/2402)) +- Rename typing and enhance collect_env script ([2420](https://github.com/open-mmlab/mmaction2/pull/2420)) +- Fix multi-label classification ([2425](https://github.com/open-mmlab/mmaction2/pull/2425), [2466](https://github.com/open-mmlab/mmaction2/pull/2466), [2532](https://github.com/open-mmlab/mmaction2/pull/2532)) +- Fix lfb configs ([2426](https://github.com/open-mmlab/mmaction2/pull/2426)) +- Fix a warning caused by `torch.div` ([2449](https://github.com/open-mmlab/mmaction2/pull/2449)) +- Fix incompatibility of ImgAug and latest Numpy ([2451](https://github.com/open-mmlab/mmaction2/pull/2451)) +- Fix MViT with_cls_token argument ([2480](https://github.com/open-mmlab/mmaction2/pull/2480)) +- Fix timm BC-breaking for TSN ([2497](https://github.com/open-mmlab/mmaction2/pull/2497)) +- Close FileHandler in Windows to make the temporary directory can be deleted ([2565](https://github.com/open-mmlab/mmaction2/pull/2565)) +- Update minimum PyTorch version to 1.8.1 ([2568](https://github.com/open-mmlab/mmaction2/pull/2568)) + +**Documentation** + +- Fix document links in README ([2358](https://github.com/open-mmlab/mmaction2/pull/2358), 
[2372](https://github.com/open-mmlab/mmaction2/pull/2372), [2376](https://github.com/open-mmlab/mmaction2/pull/2376), [2382](https://github.com/open-mmlab/mmaction2/pull/2382)) +- Update installation document ([2362](https://github.com/open-mmlab/mmaction2/pull/2362)) +- Update upstream library version requirement ([2383](https://github.com/open-mmlab/mmaction2/pull/2383)) +- Fix Colab tutorial ([2384](https://github.com/open-mmlab/mmaction2/pull/2384), [2391](https://github.com/open-mmlab/mmaction2/pull/2391), [2475](https://github.com/open-mmlab/mmaction2/pull/2475)) +- Refine documents ([2404](https://github.com/open-mmlab/mmaction2/pull/2404)) +- Update outdated config in readme ([2419](https://github.com/open-mmlab/mmaction2/pull/2419)) +- Update OpenMMLab related repo list ([2429](https://github.com/open-mmlab/mmaction2/pull/2429)) +- Fix UniFormer README and metafile ([2450](https://github.com/open-mmlab/mmaction2/pull/2450)) +- Add finetune document ([2457](https://github.com/open-mmlab/mmaction2/pull/2457)) +- Update FAQ document ([2476](https://github.com/open-mmlab/mmaction2/pull/2476), [2482](https://github.com/open-mmlab/mmaction2/pull/2482)) +- Update download datasets document ([2495](https://github.com/open-mmlab/mmaction2/pull/2495)) +- Translate Chinese document ([2516](https://github.com/open-mmlab/mmaction2/pull/2516), [2506](https://github.com/open-mmlab/mmaction2/pull/2506), [2499](https://github.com/open-mmlab/mmaction2/pull/2499)) +- Refactor model zoo and dataset zoo ([2552](https://github.com/open-mmlab/mmaction2/pull/2552)) +- Refactor Chinese document ([2567](https://github.com/open-mmlab/mmaction2/pull/2567)) + +## 1.0.0 (4/6/2023) + +**Highlights** + +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. 
+ +**New Features** + +- Support RGB-PoseC3D ([2182](https://github.com/open-mmlab/mmaction2/pull/2182)) +- Support training UniFormer V2 ([2221](https://github.com/open-mmlab/mmaction2/pull/2221)) +- Support MSG3D and CTRGCN in projects. ([2269](https://github.com/open-mmlab/mmaction2/pull/2269), [2291](https://github.com/open-mmlab/mmaction2/pull/2291)) + +**Improvements** + +- Use MMEngine to calculate FLOPs ([2300](https://github.com/open-mmlab/mmaction2/pull/2300)) +- Speed up LFB training ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Support multiprocessing on AVA evaluation ([2146](https://github.com/open-mmlab/mmaction2/pull/2146)) +- Add a demo for exporting spatial-temporal detection model to ONNX ([2225](https://github.com/open-mmlab/mmaction2/pull/2225)) +- Update spatial-temporal detection related folders ([2262](https://github.com/open-mmlab/mmaction2/pull/2262)) + +**Bug Fixes** + +- Fix flip config of TSM for sth v1/v2 dataset ([#2247](https://github.com/open-mmlab/mmaction2/pull/2247)) +- Fix circle ci ([2336](https://github.com/open-mmlab/mmaction2/pull/2336), [2334](https://github.com/open-mmlab/mmaction2/pull/2334)) +- Fix accepting an unexpected argument local-rank in PyTorch 2.0 ([2320](https://github.com/open-mmlab/mmaction2/pull/2320)) +- Fix TSM config link ([2315](https://github.com/open-mmlab/mmaction2/pull/2315)) +- Fix numpy version requirement in CI ([2284](https://github.com/open-mmlab/mmaction2/pull/2284)) +- Fix NTU pose extraction script ([2246](https://github.com/open-mmlab/mmaction2/pull/2246)) +- Fix TSM-MobileNet V2 ([2332](https://github.com/open-mmlab/mmaction2/pull/2332)) +- Fix command bugs in localization tasks' README ([2244](https://github.com/open-mmlab/mmaction2/pull/2244)) +- Fix duplicate name in DecordInit and SampleAVAFrame ([2251](https://github.com/open-mmlab/mmaction2/pull/2251)) +- Fix channel order when showing video ([2308](https://github.com/open-mmlab/mmaction2/pull/2308)) +- Specify 
map_location to cpu when using \_load_checkpoint ([2252](https://github.com/open-mmlab/mmaction2/pull/2254)) + +**Documentation** + +- Refactor and provide more user-friendly documentation ([2341](https://github.com/open-mmlab/mmaction2/pull/2341), [2312](https://github.com/open-mmlab/mmaction2/pull/2312), [2325](https://github.com/open-mmlab/mmaction2/pull/2325)) +- Add README_zh-CN ([2252](https://github.com/open-mmlab/mmaction2/pull/2252)) +- Add social networking links ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Fix sthv2 dataset annotations preparation document ([2248](https://github.com/open-mmlab/mmaction2/pull/2248)) + +## 1.0.0rc3 (2/10/2023) + +**Highlights** + +- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). +- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. + +**New Features** + +- Support UniFormer V1/V2 ([#2153](https://github.com/open-mmlab/mmaction2/pull/2153)) +- Support training MViT, and MaskFeat fine-tuning ([#2186](https://github.com/open-mmlab/mmaction2/pull/2186)) +- Support a unified inference interface: Inferencer ([#2164](https://github.com/open-mmlab/mmaction2/pull/2164)) + +**Improvements** + +- Support load data list from multi-backends ([#2176](https://github.com/open-mmlab/mmaction2/pull/2176)) + +**Bug Fixes** + +- Upgrade isort to fix CI ([#2198](https://github.com/open-mmlab/mmaction2/pull/2198)) +- Fix bug in skeleton demo ([#2214](https://github.com/open-mmlab/mmaction2/pull/2214)) + +**Documentation** + +- Add Chinese documentation for config.md ([#2188](https://github.com/open-mmlab/mmaction2/pull/2188)) +- Add readme for Omnisource ([#2205](https://github.com/open-mmlab/mmaction2/pull/2205)) + +## 1.0.0rc2 (1/10/2023) + +**Highlights** + +- Support Action Recognition model VideoMAE(NeurIPS'2022), MViT V2(CVPR'2022), C2D and skeleton-based action recognition model STGCN++ +- Support Omni-Source training on ImageNet and Kinetics datasets +- 
Support exporting spatial-temporal detection models to ONNX + +**New Features** + +- Support VideoMAE ([#1942](https://github.com/open-mmlab/mmaction2/pull/1942)) +- Support MViT V2 ([#2007](https://github.com/open-mmlab/mmaction2/pull/2007)) +- Support C2D ([#2022](https://github.com/open-mmlab/mmaction2/pull/2022)) +- Support AVA-Kinetics dataset ([#2080](https://github.com/open-mmlab/mmaction2/pull/2080)) +- Support STGCN++ ([#2156](https://github.com/open-mmlab/mmaction2/pull/2156)) +- Support exporting spatial-temporal detection models to ONNX ([#2148](https://github.com/open-mmlab/mmaction2/pull/2148)) +- Support Omni-Source training on ImageNet and Kinetics datasets ([#2143](https://github.com/open-mmlab/mmaction2/pull/2143)) + +**Improvements** + +- Support repeat batch data augmentation ([#2170](https://github.com/open-mmlab/mmaction2/pull/2170)) +- Support calculating FLOPs tool powered by fvcore ([#1997](https://github.com/open-mmlab/mmaction2/pull/1997)) +- Support Spatial-temporal detection demo ([#2019](https://github.com/open-mmlab/mmaction2/pull/2019)) +- Add SyncBufferHook and add randomness config in train.py ([#2044](https://github.com/open-mmlab/mmaction2/pull/2044)) +- Refactor gradcam ([#2049](https://github.com/open-mmlab/mmaction2/pull/2049)) +- Support init_cfg in Swin and ViTMAE ([#2055](https://github.com/open-mmlab/mmaction2/pull/2055)) +- Refactor STGCN and related pipelines ([#2087](https://github.com/open-mmlab/mmaction2/pull/2087)) +- Refactor visualization tools ([#2092](https://github.com/open-mmlab/mmaction2/pull/2092)) +- Update `SampleFrames` transform and improve most models' performance ([#1942](https://github.com/open-mmlab/mmaction2/pull/1942)) +- Support real-time webcam demo ([#2152](https://github.com/open-mmlab/mmaction2/pull/2152)) +- Refactor and enhance 2s-AGCN ([#2130](https://github.com/open-mmlab/mmaction2/pull/2130)) +- Support adjusting fps in `SampleFrame` 
([#2157](https://github.com/open-mmlab/mmaction2/pull/2157)) + +**Bug Fixes** + +- Fix CI upstream library dependency ([#2000](https://github.com/open-mmlab/mmaction2/pull/2000)) +- Fix SlowOnly readme typos and results ([#2006](https://github.com/open-mmlab/mmaction2/pull/2006)) +- Fix VideoSwin readme ([#2010](https://github.com/open-mmlab/mmaction2/pull/2010)) +- Fix tools and mim error ([#2028](https://github.com/open-mmlab/mmaction2/pull/2028)) +- Fix Imgaug wrapper ([#2024](https://github.com/open-mmlab/mmaction2/pull/2024)) +- Remove useless scripts ([#2032](https://github.com/open-mmlab/mmaction2/pull/2032)) +- Fix multi-view inference ([#2045](https://github.com/open-mmlab/mmaction2/pull/2045)) +- Update mmcv maximum version to 1.8.0 ([#2047](https://github.com/open-mmlab/mmaction2/pull/2047)) +- Fix torchserver dependency ([#2053](https://github.com/open-mmlab/mmaction2/pull/2053)) +- Fix `gen_ntu_rgbd_raw` script ([#2076](https://github.com/open-mmlab/mmaction2/pull/2076)) +- Update AVA-Kinetics experiment configs and results ([#2099](https://github.com/open-mmlab/mmaction2/pull/2099)) +- Add `joint.pkl` and `bone.pkl` used in multi-stream fusion tool ([#2106](https://github.com/open-mmlab/mmaction2/pull/2106)) +- Fix lint CI config ([#2110](https://github.com/open-mmlab/mmaction2/pull/2110)) +- Update testing accuracy for modified `SampleFrames` ([#2117](https://github.com/open-mmlab/mmaction2/pull/2117)), ([#2121](https://github.com/open-mmlab/mmaction2/pull/2121)), ([#2122](https://github.com/open-mmlab/mmaction2/pull/2122)), ([#2124](https://github.com/open-mmlab/mmaction2/pull/2124)), ([#2125](https://github.com/open-mmlab/mmaction2/pull/2125)), ([#2126](https://github.com/open-mmlab/mmaction2/pull/2126)), ([#2129](https://github.com/open-mmlab/mmaction2/pull/2129)), ([#2128](https://github.com/open-mmlab/mmaction2/pull/2128)) +- Fix timm related bug ([#1976](https://github.com/open-mmlab/mmaction2/pull/1976)) +- Fix `check_videos.py` script 
([#2134](https://github.com/open-mmlab/mmaction2/pull/2134)) +- Update CI maximum torch version to 1.13.0 ([#2118](https://github.com/open-mmlab/mmaction2/pull/2118)) + +**Documentation** + +- Add MMYOLO description in README ([#2011](https://github.com/open-mmlab/mmaction2/pull/2011)) +- Add v1.x introduction in README ([#2023](https://github.com/open-mmlab/mmaction2/pull/2023)) +- Fix link in README ([#2035](https://github.com/open-mmlab/mmaction2/pull/2035)) +- Refine some docs ([#2038](https://github.com/open-mmlab/mmaction2/pull/2038)), ([#2040](https://github.com/open-mmlab/mmaction2/pull/2040)), ([#2058](https://github.com/open-mmlab/mmaction2/pull/2058)) +- Update TSN/TSM Readme ([#2082](https://github.com/open-mmlab/mmaction2/pull/2082)) +- Add chinese document ([#2083](https://github.com/open-mmlab/mmaction2/pull/2083)) +- Adjust document structure ([#2088](https://github.com/open-mmlab/mmaction2/pull/2088)) +- Fix Sth-Sth and Jester dataset links ([#2103](https://github.com/open-mmlab/mmaction2/pull/2103)) +- Fix doc link ([#2131](https://github.com/open-mmlab/mmaction2/pull/2131)) + +## 1.0.0rc1 (10/14/2022) + +**Highlights** + +- Support Video Swin Transformer + +**New Features** + +- Support Video Swin Transformer ([#1939](https://github.com/open-mmlab/mmaction2/pull/1939)) + +**Improvements** + +- Add colab tutorial for 1.x ([#1956](https://github.com/open-mmlab/mmaction2/pull/1956)) +- Support skeleton-based action recognition demo ([#1920](https://github.com/open-mmlab/mmaction2/pull/1920)) + +**Bug Fixes** + +- Fix link in doc ([#1986](https://github.com/open-mmlab/mmaction2/pull/1986), [#1967](https://github.com/open-mmlab/mmaction2/pull/1967), [#1951](https://github.com/open-mmlab/mmaction2/pull/1951), [#1926](https://github.com/open-mmlab/mmaction2/pull/1926),[#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1927](https://github.com/open-mmlab/mmaction2/pull/1927), 
[#1925](https://github.com/open-mmlab/mmaction2/pull/1925)) +- Fix CI ([#1987](https://github.com/open-mmlab/mmaction2/pull/1987), [#1930](https://github.com/open-mmlab/mmaction2/pull/1930), [#1923](https://github.com/open-mmlab/mmaction2/pull/1923)) +- Fix pre-commit hook config ([#1971](https://github.com/open-mmlab/mmaction2/pull/1971)) +- Fix TIN config ([#1912](https://github.com/open-mmlab/mmaction2/pull/1912)) +- Fix UT for BMN and BSN ([#1966](https://github.com/open-mmlab/mmaction2/pull/1966)) +- Fix UT for Recognizer2D ([#1937](https://github.com/open-mmlab/mmaction2/pull/1937)) +- Fix BSN and BMN configs for localization ([#1913](https://github.com/open-mmlab/mmaction2/pull/1913)) +- Modify ST-GCN configs ([#1914](https://github.com/open-mmlab/mmaction2/pull/1914)) +- Fix typo in migration doc ([#1931](https://github.com/open-mmlab/mmaction2/pull/1931)) +- Remove Onnx related tools ([#1928](https://github.com/open-mmlab/mmaction2/pull/1928)) +- Update TANet readme ([#1916](https://github.com/open-mmlab/mmaction2/pull/1916), [#1890](https://github.com/open-mmlab/mmaction2/pull/1890)) +- Update 2S-AGCN readme ([#1915](https://github.com/open-mmlab/mmaction2/pull/1915)) +- Fix TSN configs ([#1905](https://github.com/open-mmlab/mmaction2/pull/1905)) +- Fix configs for detection ([#1903](https://github.com/open-mmlab/mmaction2/pull/1903)) +- Fix typo in TIN config ([#1904](https://github.com/open-mmlab/mmaction2/pull/1904)) +- Fix PoseC3D readme ([#1899](https://github.com/open-mmlab/mmaction2/pull/1899)) +- Fix ST-GCN configs ([#1891](https://github.com/open-mmlab/mmaction2/pull/1891)) +- Fix audio recognition readme ([#1898](https://github.com/open-mmlab/mmaction2/pull/1898)) +- Fix TSM readme ([#1887](https://github.com/open-mmlab/mmaction2/pull/1887)) +- Fix SlowOnly readme ([#1889](https://github.com/open-mmlab/mmaction2/pull/1889)) +- Fix TRN readme ([#1888](https://github.com/open-mmlab/mmaction2/pull/1888)) +- Fix typo in get_started doc 
([#1895](https://github.com/open-mmlab/mmaction2/pull/1895)) + +## 1.0.0rc0 (09/01/2022) + +We are excited to announce the release of MMAction2 v1.0.0rc0. +MMAction2 1.0.0beta is the first version of MMAction2 1.x, a part of the OpenMMLab 2.0 projects. +Built upon the new [training engine](https://github.com/open-mmlab/mmengine). + +**Highlights** + +- **New engines**. MMAction2 1.x is based on [MMEngine](https://github.com/open-mmlab/mmengine), which provides a general and powerful runner that allows more flexible customizations and significantly simplifies the entrypoints of high-level interfaces. + +- **Unified interfaces**. As a part of the OpenMMLab 2.0 projects, MMAction2 1.x unifies and refactors the interfaces and internal logics of train, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design in those interfaces and logics to allow the emergence of multi-task/modality algorithms. + +- **More documentation and tutorials**. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://github.com/open-mmlab/mmaction2/blob/main/docs/en/migration.md). + +**Breaking Changes** + +In this release, we made lots of major refactoring and modifications. Please refer to the [migration guide](../migration.md) for details and migration instructions. 
+ +## 0.24.0 (05/05/2022) + +**Highlights** + +- Support different seeds + +**New Features** + +- Add lateral norm in multigrid config ([#1567](https://github.com/open-mmlab/mmaction2/pull/1567)) +- Add openpose 25 joints in graph config ([#1578](https://github.com/open-mmlab/mmaction2/pull/1578)) +- Support MLU Backend ([#1608](https://github.com/open-mmlab/mmaction2/pull/1608)) + +**Bug and Typo Fixes** + +- Fix local_rank ([#1558](https://github.com/open-mmlab/mmaction2/pull/1558)) +- Fix install typo ([#1571](https://github.com/open-mmlab/mmaction2/pull/1571)) +- Fix the inference API doc ([#1580](https://github.com/open-mmlab/mmaction2/pull/1580)) +- Fix zh-CN demo.md and getting_started.md ([#1587](https://github.com/open-mmlab/mmaction2/pull/1587)) +- Remove Recommonmark ([#1595](https://github.com/open-mmlab/mmaction2/pull/1595)) +- Fix inference with ndarray ([#1603](https://github.com/open-mmlab/mmaction2/pull/1603)) +- Fix the log error when `IterBasedRunner` is used ([#1606](https://github.com/open-mmlab/mmaction2/pull/1606)) + +## 0.23.0 (04/01/2022) + +**Highlights** + +- Support different seeds +- Provide multi-node training & testing script +- Update error log + +**New Features** + +- Support different seeds([#1502](https://github.com/open-mmlab/mmaction2/pull/1502)) +- Provide multi-node training & testing script([#1521](https://github.com/open-mmlab/mmaction2/pull/1521)) +- Update error log([#1546](https://github.com/open-mmlab/mmaction2/pull/1546)) + +**Documentations** + +- Update gpus in Slowfast readme([#1497](https://github.com/open-mmlab/mmaction2/pull/1497)) +- Fix work_dir in multigrid config([#1498](https://github.com/open-mmlab/mmaction2/pull/1498)) +- Add sub bn docs([#1503](https://github.com/open-mmlab/mmaction2/pull/1503)) +- Add shortcycle sampler docs([#1513](https://github.com/open-mmlab/mmaction2/pull/1513)) +- Update Windows Declaration([#1520](https://github.com/open-mmlab/mmaction2/pull/1520)) +- Update the link for 
ST-GCN([#1544](https://github.com/open-mmlab/mmaction2/pull/1544)) +- Update install commands([#1549](https://github.com/open-mmlab/mmaction2/pull/1549)) + +**Bug and Typo Fixes** + +- Update colab tutorial install cmds([#1522](https://github.com/open-mmlab/mmaction2/pull/1522)) +- Fix num_iters_per_epoch in analyze_logs.py([#1530](https://github.com/open-mmlab/mmaction2/pull/1530)) +- Fix distributed_sampler([#1532](https://github.com/open-mmlab/mmaction2/pull/1532)) +- Fix cd dir error([#1545](https://github.com/open-mmlab/mmaction2/pull/1545)) +- Update arg names([#1548](https://github.com/open-mmlab/mmaction2/pull/1548)) + +**ModelZoo** + +## 0.22.0 (03/05/2022) + +**Highlights** + +- Support Multigrid training strategy +- Support CPU training +- Support audio demo +- Support topk customizing in models/heads/base.py + +**New Features** + +- Support Multigrid training strategy([#1378](https://github.com/open-mmlab/mmaction2/pull/1378)) +- Support STGCN in demo_skeleton.py([#1391](https://github.com/open-mmlab/mmaction2/pull/1391)) +- Support CPU training([#1407](https://github.com/open-mmlab/mmaction2/pull/1407)) +- Support audio demo([#1425](https://github.com/open-mmlab/mmaction2/pull/1425)) +- Support topk customizing in models/heads/base.py([#1452](https://github.com/open-mmlab/mmaction2/pull/1452)) + +**Documentations** + +- Add OpenMMLab platform([#1393](https://github.com/open-mmlab/mmaction2/pull/1393)) +- Update links([#1394](https://github.com/open-mmlab/mmaction2/pull/1394)) +- Update readme in configs([#1404](https://github.com/open-mmlab/mmaction2/pull/1404)) +- Update instructions to install mmcv-full([#1426](https://github.com/open-mmlab/mmaction2/pull/1426)) +- Add shortcut([#1433](https://github.com/open-mmlab/mmaction2/pull/1433)) +- Update modelzoo([#1439](https://github.com/open-mmlab/mmaction2/pull/1439)) +- add video_structuralize in readme([#1455](https://github.com/open-mmlab/mmaction2/pull/1455)) +- Update OpenMMLab repo 
information([#1482](https://github.com/open-mmlab/mmaction2/pull/1482)) + +**Bug and Typo Fixes** + +- Update train.py([#1375](https://github.com/open-mmlab/mmaction2/pull/1375)) +- Fix printout bug([#1382](https://github.com/open-mmlab/mmaction2/pull/1382)) +- Update multi processing setting([#1395](https://github.com/open-mmlab/mmaction2/pull/1395)) +- Setup multi processing both in train and test([#1405](https://github.com/open-mmlab/mmaction2/pull/1405)) +- Fix bug in nondistributed multi-gpu training([#1406](https://github.com/open-mmlab/mmaction2/pull/1406)) +- Add variable fps in ava_dataset.py([#1409](https://github.com/open-mmlab/mmaction2/pull/1409)) +- Only support distributed training([#1414](https://github.com/open-mmlab/mmaction2/pull/1414)) +- Set test_mode for AVA configs([#1432](https://github.com/open-mmlab/mmaction2/pull/1432)) +- Support single label([#1434](https://github.com/open-mmlab/mmaction2/pull/1434)) +- Add check copyright([#1447](https://github.com/open-mmlab/mmaction2/pull/1447)) +- Support Windows CI([#1448](https://github.com/open-mmlab/mmaction2/pull/1448)) +- Fix wrong device of class_weight in models/losses/cross_entropy_loss.py([#1457](https://github.com/open-mmlab/mmaction2/pull/1457)) +- Fix bug caused by distributed([#1459](https://github.com/open-mmlab/mmaction2/pull/1459)) +- Update readme([#1460](https://github.com/open-mmlab/mmaction2/pull/1460)) +- Fix lint caused by colab automatic upload([#1461](https://github.com/open-mmlab/mmaction2/pull/1461)) +- Refine CI([#1471](https://github.com/open-mmlab/mmaction2/pull/1471)) +- Update pre-commit([#1474](https://github.com/open-mmlab/mmaction2/pull/1474)) +- Add deprecation message for deploy tool([#1483](https://github.com/open-mmlab/mmaction2/pull/1483)) + +**ModelZoo** + +- Support slowfast_steplr([#1421](https://github.com/open-mmlab/mmaction2/pull/1421)) + +## 0.21.0 (31/12/2021) + +**Highlights** + +- Support 2s-AGCN +- Support publish models in Windows +- Improve 
some sthv1 related models +- Support BABEL + +**New Features** + +- Support 2s-AGCN([#1248](https://github.com/open-mmlab/mmaction2/pull/1248)) +- Support skip postproc in ntu_pose_extraction([#1295](https://github.com/open-mmlab/mmaction2/pull/1295)) +- Support publish models in Windows([#1325](https://github.com/open-mmlab/mmaction2/pull/1325)) +- Add copyright checkhook in pre-commit-config([#1344](https://github.com/open-mmlab/mmaction2/pull/1344)) + +**Documentations** + +- Add MMFlow ([#1273](https://github.com/open-mmlab/mmaction2/pull/1273)) +- Revise README.md and add projects.md ([#1286](https://github.com/open-mmlab/mmaction2/pull/1286)) +- Add 2s-AGCN in Updates([#1289](https://github.com/open-mmlab/mmaction2/pull/1289)) +- Add MMFewShot([#1300](https://github.com/open-mmlab/mmaction2/pull/1300)) +- Add MMHuman3d([#1304](https://github.com/open-mmlab/mmaction2/pull/1304)) +- Update pre-commit([#1313](https://github.com/open-mmlab/mmaction2/pull/1313)) +- Use share menu from the theme instead([#1328](https://github.com/open-mmlab/mmaction2/pull/1328)) +- Update installation command([#1340](https://github.com/open-mmlab/mmaction2/pull/1340)) + +**Bug and Typo Fixes** + +- Update the inference part in notebooks([#1256](https://github.com/open-mmlab/mmaction2/pull/1256)) +- Update the map_location([#1262](https://github.com/open-mmlab/mmaction2/pull/1262)) +- Fix bug that start_index is not used in RawFrameDecode([#1278](https://github.com/open-mmlab/mmaction2/pull/1278)) +- Fix bug in init_random_seed([#1282](https://github.com/open-mmlab/mmaction2/pull/1282)) +- Fix bug in setup.py([#1303](https://github.com/open-mmlab/mmaction2/pull/1303)) +- Fix interrogate error in workflows([#1305](https://github.com/open-mmlab/mmaction2/pull/1305)) +- Fix typo in slowfast config([#1309](https://github.com/open-mmlab/mmaction2/pull/1309)) +- Cancel previous runs that are not completed([#1327](https://github.com/open-mmlab/mmaction2/pull/1327)) +- Fix missing 
skip_postproc parameter([#1347](https://github.com/open-mmlab/mmaction2/pull/1347)) +- Update ssn.py([#1355](https://github.com/open-mmlab/mmaction2/pull/1355)) +- Use latest youtube-dl([#1357](https://github.com/open-mmlab/mmaction2/pull/1357)) +- Fix test-best([#1362](https://github.com/open-mmlab/mmaction2/pull/1362)) + +**ModelZoo** + +- Improve some sthv1 related models([#1306](https://github.com/open-mmlab/mmaction2/pull/1306)) +- Support BABEL([#1332](https://github.com/open-mmlab/mmaction2/pull/1332)) + +## 0.20.0 (07/10/2021) + +**Highlights** + +- Support TorchServe +- Add video structuralize demo +- Support using 3D skeletons for skeleton-based action recognition +- Benchmark PoseC3D on UCF and HMDB + +**New Features** + +- Support TorchServe ([#1212](https://github.com/open-mmlab/mmaction2/pull/1212)) +- Support 3D skeletons pre-processing ([#1218](https://github.com/open-mmlab/mmaction2/pull/1218)) +- Support video structuralize demo ([#1197](https://github.com/open-mmlab/mmaction2/pull/1197)) + +**Documentations** + +- Revise README.md and add projects.md ([#1214](https://github.com/open-mmlab/mmaction2/pull/1214)) +- Add CN docs for Skeleton dataset, PoseC3D and ST-GCN ([#1228](https://github.com/open-mmlab/mmaction2/pull/1228), [#1237](https://github.com/open-mmlab/mmaction2/pull/1237), [#1236](https://github.com/open-mmlab/mmaction2/pull/1236)) +- Add tutorial for custom dataset training for skeleton-based action recognition ([#1234](https://github.com/open-mmlab/mmaction2/pull/1234)) + +**Bug and Typo Fixes** + +- Fix tutorial link ([#1219](https://github.com/open-mmlab/mmaction2/pull/1219)) +- Fix GYM links ([#1224](https://github.com/open-mmlab/mmaction2/pull/1224)) + +**ModelZoo** + +- Benchmark PoseC3D on UCF and HMDB ([#1223](https://github.com/open-mmlab/mmaction2/pull/1223)) +- Add ST-GCN + 3D skeleton model for NTU60-XSub ([#1236](https://github.com/open-mmlab/mmaction2/pull/1236)) + +## 0.19.0 (07/10/2021) + +**Highlights** + +- Support 
ST-GCN +- Refactor the inference API +- Add code spell check hook + +**New Features** + +- Support ST-GCN ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123)) + +**Improvement** + +- Add label maps for every dataset ([#1127](https://github.com/open-mmlab/mmaction2/pull/1127)) +- Remove useless code MultiGroupCrop ([#1180](https://github.com/open-mmlab/mmaction2/pull/1180)) +- Refactor Inference API ([#1191](https://github.com/open-mmlab/mmaction2/pull/1191)) +- Add code spell check hook ([#1208](https://github.com/open-mmlab/mmaction2/pull/1208)) +- Use docker in CI ([#1159](https://github.com/open-mmlab/mmaction2/pull/1159)) + +**Documentations** + +- Update metafiles to new OpenMMLAB protocols ([#1134](https://github.com/open-mmlab/mmaction2/pull/1134)) +- Switch to new doc style ([#1160](https://github.com/open-mmlab/mmaction2/pull/1160)) +- Improve the ERROR message ([#1203](https://github.com/open-mmlab/mmaction2/pull/1203)) +- Fix invalid URL in getting_started ([#1169](https://github.com/open-mmlab/mmaction2/pull/1169)) + +**Bug and Typo Fixes** + +- Compatible with new MMClassification ([#1139](https://github.com/open-mmlab/mmaction2/pull/1139)) +- Add missing runtime dependencies ([#1144](https://github.com/open-mmlab/mmaction2/pull/1144)) +- Fix THUMOS tag proposals path ([#1156](https://github.com/open-mmlab/mmaction2/pull/1156)) +- Fix LoadHVULabel ([#1194](https://github.com/open-mmlab/mmaction2/pull/1194)) +- Switch the default value of `persistent_workers` to False ([#1202](https://github.com/open-mmlab/mmaction2/pull/1202)) +- Fix `_freeze_stages` for MobileNetV2 ([#1193](https://github.com/open-mmlab/mmaction2/pull/1193)) +- Fix resume when building rawframes ([#1150](https://github.com/open-mmlab/mmaction2/pull/1150)) +- Fix device bug for class weight ([#1188](https://github.com/open-mmlab/mmaction2/pull/1188)) +- Correct Arg names in extract_audio.py ([#1148](https://github.com/open-mmlab/mmaction2/pull/1148)) + +**ModelZoo** + +- Add 
TSM-MobileNetV2 ported from TSM ([#1163](https://github.com/open-mmlab/mmaction2/pull/1163)) +- Add ST-GCN for NTURGB+D-XSub-60 ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123)) + +## 0.18.0 (02/09/2021) + +**Improvement** + +- Add CopyRight ([#1099](https://github.com/open-mmlab/mmaction2/pull/1099)) +- Support NTU Pose Extraction ([#1076](https://github.com/open-mmlab/mmaction2/pull/1076)) +- Support Caching in RawFrameDecode ([#1078](https://github.com/open-mmlab/mmaction2/pull/1078)) +- Add citations & Support python3.9 CI & Use fixed-version sphinx ([#1125](https://github.com/open-mmlab/mmaction2/pull/1125)) + +**Documentations** + +- Add Descriptions of PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053)) + +**Bug and Typo Fixes** + +- Fix SSV2 checkpoints ([#1101](https://github.com/open-mmlab/mmaction2/pull/1101)) +- Fix CSN normalization ([#1116](https://github.com/open-mmlab/mmaction2/pull/1116)) +- Fix typo ([#1121](https://github.com/open-mmlab/mmaction2/pull/1121)) +- Fix new_crop_quadruple bug ([#1108](https://github.com/open-mmlab/mmaction2/pull/1108)) + +## 0.17.0 (03/08/2021) + +**Highlights** + +- Support PyTorch 1.9 +- Support Pytorchvideo Transforms +- Support PreciseBN + +**New Features** + +- Support Pytorchvideo Transforms ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008)) +- Support PreciseBN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038)) + +**Improvements** + +- Remove redundant augmentations in config files ([#996](https://github.com/open-mmlab/mmaction2/pull/996)) +- Make resource directory to hold common resource pictures ([#1011](https://github.com/open-mmlab/mmaction2/pull/1011)) +- Remove deprecated FrameSelector ([#1010](https://github.com/open-mmlab/mmaction2/pull/1010)) +- Support Concat Dataset ([#1000](https://github.com/open-mmlab/mmaction2/pull/1000)) +- Add `to-mp4` option to resize_videos.py ([#1021](https://github.com/open-mmlab/mmaction2/pull/1021)) +- Add option to 
keep tail frames ([#1050](https://github.com/open-mmlab/mmaction2/pull/1050)) +- Update MIM support ([#1061](https://github.com/open-mmlab/mmaction2/pull/1061)) +- Calculate Top-K accurate and inaccurate classes ([#1047](https://github.com/open-mmlab/mmaction2/pull/1047)) + +**Bug and Typo Fixes** + +- Fix bug in PoseC3D demo ([#1009](https://github.com/open-mmlab/mmaction2/pull/1009)) +- Fix some problems in resize_videos.py ([#1012](https://github.com/open-mmlab/mmaction2/pull/1012)) +- Support torch1.9 ([#1015](https://github.com/open-mmlab/mmaction2/pull/1015)) +- Remove redundant code in CI ([#1046](https://github.com/open-mmlab/mmaction2/pull/1046)) +- Fix bug about persistent_workers ([#1044](https://github.com/open-mmlab/mmaction2/pull/1044)) +- Support TimeSformer feature extraction ([#1035](https://github.com/open-mmlab/mmaction2/pull/1035)) +- Fix ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025)) + +**ModelZoo** + +- Add TSM-R50 sthv1 models trained by PytorchVideo RandAugment and AugMix ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008)) +- Update SlowOnly SthV1 checkpoints ([#1034](https://github.com/open-mmlab/mmaction2/pull/1034)) +- Add SlowOnly Kinetics400 checkpoints trained with Precise-BN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038)) +- Add CSN-R50 from scratch checkpoints ([#1045](https://github.com/open-mmlab/mmaction2/pull/1045)) +- TPN Kinetics-400 Checkpoints trained with the new ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025)) + +**Documentation** + +- Add Chinese translation of feature_extraction.md ([#1020](https://github.com/open-mmlab/mmaction2/pull/1020)) +- Fix the code snippet in getting_started.md ([#1023](https://github.com/open-mmlab/mmaction2/pull/1023)) +- Fix TANet config table ([#1028](https://github.com/open-mmlab/mmaction2/pull/1028)) +- Add description to PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053)) + +## 0.16.0 
(01/07/2021) + +**Highlights** + +- Support using backbone from pytorch-image-models(timm) +- Support PIMS Decoder +- Demo for skeleton-based action recognition +- Support Timesformer + +**New Features** + +- Support using backbones from pytorch-image-models(timm) for TSN ([#880](https://github.com/open-mmlab/mmaction2/pull/880)) +- Support torchvision transformations in preprocessing pipelines ([#972](https://github.com/open-mmlab/mmaction2/pull/972)) +- Demo for skeleton-based action recognition ([#972](https://github.com/open-mmlab/mmaction2/pull/972)) +- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839)) + +**Improvements** + +- Add a tool to find invalid videos ([#907](https://github.com/open-mmlab/mmaction2/pull/907), [#950](https://github.com/open-mmlab/mmaction2/pull/950)) +- Add an option to specify spectrogram_type ([#909](https://github.com/open-mmlab/mmaction2/pull/909)) +- Add json output to video demo ([#906](https://github.com/open-mmlab/mmaction2/pull/906)) +- Add MIM related docs ([#918](https://github.com/open-mmlab/mmaction2/pull/918)) +- Rename lr to scheduler ([#916](https://github.com/open-mmlab/mmaction2/pull/916)) +- Support `--cfg-options` for demos ([#911](https://github.com/open-mmlab/mmaction2/pull/911)) +- Support number counting for flow-wise filename template ([#922](https://github.com/open-mmlab/mmaction2/pull/922)) +- Add Chinese tutorial ([#941](https://github.com/open-mmlab/mmaction2/pull/941)) +- Change ResNet3D default values ([#939](https://github.com/open-mmlab/mmaction2/pull/939)) +- Adjust script structure ([#935](https://github.com/open-mmlab/mmaction2/pull/935)) +- Add font color to args in long_video_demo ([#947](https://github.com/open-mmlab/mmaction2/pull/947)) +- Polish code style with Pylint ([#908](https://github.com/open-mmlab/mmaction2/pull/908)) +- Support PIMS Decoder ([#946](https://github.com/open-mmlab/mmaction2/pull/946)) +- Improve Metafiles 
([#956](https://github.com/open-mmlab/mmaction2/pull/956), [#979](https://github.com/open-mmlab/mmaction2/pull/979), [#966](https://github.com/open-mmlab/mmaction2/pull/966)) +- Add links to download Kinetics400 validation ([#920](https://github.com/open-mmlab/mmaction2/pull/920)) +- Audit the usage of shutil.rmtree ([#943](https://github.com/open-mmlab/mmaction2/pull/943)) +- Polish localizer related codes([#913](https://github.com/open-mmlab/mmaction2/pull/913)) + +**Bug and Typo Fixes** + +- Fix spatiotemporal detection demo ([#899](https://github.com/open-mmlab/mmaction2/pull/899)) +- Fix docstring for 3D inflate ([#925](https://github.com/open-mmlab/mmaction2/pull/925)) +- Fix bug of writing text to video with TextClip ([#952](https://github.com/open-mmlab/mmaction2/pull/952)) +- Fix mmcv install in CI ([#977](https://github.com/open-mmlab/mmaction2/pull/977)) + +**ModelZoo** + +- Add TSN with Swin Transformer backbone as an example for using pytorch-image-models(timm) backbones ([#880](https://github.com/open-mmlab/mmaction2/pull/880)) +- Port CSN checkpoints from VMZ ([#945](https://github.com/open-mmlab/mmaction2/pull/945)) +- Release various checkpoints for UCF101, HMDB51 and Sthv1 ([#938](https://github.com/open-mmlab/mmaction2/pull/938)) +- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839)) +- Update TSM modelzoo ([#981](https://github.com/open-mmlab/mmaction2/pull/981)) + +## 0.15.0 (31/05/2021) + +**Highlights** + +- Support PoseC3D +- Support ACRN +- Support MIM + +**New Features** + +- Support PoseC3D ([#786](https://github.com/open-mmlab/mmaction2/pull/786), [#890](https://github.com/open-mmlab/mmaction2/pull/890)) +- Support MIM ([#870](https://github.com/open-mmlab/mmaction2/pull/870)) +- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891)) +- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864)) + +**Improvements** + +- Add `metric_options` for evaluation to 
docs ([#873](https://github.com/open-mmlab/mmaction2/pull/873)) +- Support creating a new label map based on custom classes for demos about spatio temporal demo ([#879](https://github.com/open-mmlab/mmaction2/pull/879)) +- Improve document about AVA dataset preparation ([#878](https://github.com/open-mmlab/mmaction2/pull/878)) +- Provide a script to extract clip-level feature ([#856](https://github.com/open-mmlab/mmaction2/pull/856)) + +**Bug and Typo Fixes** + +- Fix issues about resume ([#877](https://github.com/open-mmlab/mmaction2/pull/877), [#878](https://github.com/open-mmlab/mmaction2/pull/878)) +- Correct the key name of `eval_results` dictionary for metric 'mmit_mean_average_precision' ([#885](https://github.com/open-mmlab/mmaction2/pull/885)) + +**ModelZoo** + +- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864)) +- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891)) + +## 0.14.0 (30/04/2021) + +**Highlights** + +- Support TRN +- Support Diving48 + +**New Features** + +- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755)) +- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835)) +- Support Webcam Demo for Spatio-temporal Action Detection Models ([#795](https://github.com/open-mmlab/mmaction2/pull/795)) + +**Improvements** + +- Add softmax option for pytorch2onnx tool ([#781](https://github.com/open-mmlab/mmaction2/pull/781)) +- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755)) +- Test with onnx models and TensorRT engines ([#758](https://github.com/open-mmlab/mmaction2/pull/758)) +- Speed up AVA Testing ([#784](https://github.com/open-mmlab/mmaction2/pull/784)) +- Add `self.with_neck` attribute ([#796](https://github.com/open-mmlab/mmaction2/pull/796)) +- Update installation document ([#798](https://github.com/open-mmlab/mmaction2/pull/798)) +- Use a random master port ([#809](https://github.com/open-mmlab/mmaction2/pull/809)) +- 
Update AVA processing data document ([#801](https://github.com/open-mmlab/mmaction2/pull/801)) +- Refactor spatio-temporal augmentation ([#782](https://github.com/open-mmlab/mmaction2/pull/782)) +- Add QR code in CN README ([#812](https://github.com/open-mmlab/mmaction2/pull/812)) +- Add Alternative way to download Kinetics ([#817](https://github.com/open-mmlab/mmaction2/pull/817), [#822](https://github.com/open-mmlab/mmaction2/pull/822)) +- Refactor Sampler ([#790](https://github.com/open-mmlab/mmaction2/pull/790)) +- Use EvalHook in MMCV with backward compatibility ([#793](https://github.com/open-mmlab/mmaction2/pull/793)) +- Use MMCV Model Registry ([#843](https://github.com/open-mmlab/mmaction2/pull/843)) + +**Bug and Typo Fixes** + +- Fix a bug in pytorch2onnx.py when `num_classes <= 4` ([#800](https://github.com/open-mmlab/mmaction2/pull/800), [#824](https://github.com/open-mmlab/mmaction2/pull/824)) +- Fix `demo_spatiotemporal_det.py` error ([#803](https://github.com/open-mmlab/mmaction2/pull/803), [#805](https://github.com/open-mmlab/mmaction2/pull/805)) +- Fix loading config bugs when resume ([#820](https://github.com/open-mmlab/mmaction2/pull/820)) +- Make HMDB51 annotation generation more robust ([#811](https://github.com/open-mmlab/mmaction2/pull/811)) + +**ModelZoo** + +- Update checkpoint for 256 height in something-V2 ([#789](https://github.com/open-mmlab/mmaction2/pull/789)) +- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835)) + +## 0.13.0 (31/03/2021) + +**Highlights** + +- Support LFB +- Support using backbone from MMCls/TorchVision +- Add Chinese documentation + +**New Features** + +- Support LFB ([#553](https://github.com/open-mmlab/mmaction2/pull/553)) +- Support using backbones from MMCls for TSN ([#679](https://github.com/open-mmlab/mmaction2/pull/679)) +- Support using backbones from TorchVision for TSN ([#720](https://github.com/open-mmlab/mmaction2/pull/720)) +- Support Mixup and Cutmix for recognizers 
([#681](https://github.com/open-mmlab/mmaction2/pull/681)) +- Support Chinese documentation ([#665](https://github.com/open-mmlab/mmaction2/pull/665), [#680](https://github.com/open-mmlab/mmaction2/pull/680), [#689](https://github.com/open-mmlab/mmaction2/pull/689), [#701](https://github.com/open-mmlab/mmaction2/pull/701), [#702](https://github.com/open-mmlab/mmaction2/pull/702), [#703](https://github.com/open-mmlab/mmaction2/pull/703), [#706](https://github.com/open-mmlab/mmaction2/pull/706), [#716](https://github.com/open-mmlab/mmaction2/pull/716), [#717](https://github.com/open-mmlab/mmaction2/pull/717), [#731](https://github.com/open-mmlab/mmaction2/pull/731), [#733](https://github.com/open-mmlab/mmaction2/pull/733), [#735](https://github.com/open-mmlab/mmaction2/pull/735), [#736](https://github.com/open-mmlab/mmaction2/pull/736), [#737](https://github.com/open-mmlab/mmaction2/pull/737), [#738](https://github.com/open-mmlab/mmaction2/pull/738), [#739](https://github.com/open-mmlab/mmaction2/pull/739), [#740](https://github.com/open-mmlab/mmaction2/pull/740), [#742](https://github.com/open-mmlab/mmaction2/pull/742), [#752](https://github.com/open-mmlab/mmaction2/pull/752), [#759](https://github.com/open-mmlab/mmaction2/pull/759), [#761](https://github.com/open-mmlab/mmaction2/pull/761), [#772](https://github.com/open-mmlab/mmaction2/pull/772), [#775](https://github.com/open-mmlab/mmaction2/pull/775)) + +**Improvements** + +- Add slowfast config/json/log/ckpt for training custom classes of AVA ([#678](https://github.com/open-mmlab/mmaction2/pull/678)) +- Set RandAugment as Imgaug default transforms ([#585](https://github.com/open-mmlab/mmaction2/pull/585)) +- Add `--test-last` & `--test-best` for `tools/train.py` to test checkpoints after training ([#608](https://github.com/open-mmlab/mmaction2/pull/608)) +- Add fcn_testing in TPN ([#684](https://github.com/open-mmlab/mmaction2/pull/684)) +- Remove redundant recall functions 
([#741](https://github.com/open-mmlab/mmaction2/pull/741)) +- Recursively remove pretrained step for testing ([#695](https://github.com/open-mmlab/mmaction2/pull/695)) +- Improve demo by limiting inference fps ([#668](https://github.com/open-mmlab/mmaction2/pull/668)) + +**Bug and Typo Fixes** + +- Fix a bug about multi-class in VideoDataset ([#723](https://github.com/open-mmlab/mmaction2/pull/678)) +- Reverse key-value in anet filelist generation ([#686](https://github.com/open-mmlab/mmaction2/pull/686)) +- Fix flow norm cfg typo ([#693](https://github.com/open-mmlab/mmaction2/pull/693)) + +**ModelZoo** + +- Add LFB for AVA2.1 ([#553](https://github.com/open-mmlab/mmaction2/pull/553)) +- Add TSN with ResNeXt-101-32x4d backbone as an example for using MMCls backbones ([#679](https://github.com/open-mmlab/mmaction2/pull/679)) +- Add TSN with Densenet161 backbone as an example for using TorchVision backbones ([#720](https://github.com/open-mmlab/mmaction2/pull/720)) +- Add slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb ([#690](https://github.com/open-mmlab/mmaction2/pull/690)) +- Add slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb ([#704](https://github.com/open-mmlab/mmaction2/pull/704)) +- Add slowonly_nl_kinetics_pretrained_r50_4x16x1(8x8x1)\_20e_ava_rgb ([#730](https://github.com/open-mmlab/mmaction2/pull/730)) + +## 0.12.0 (28/02/2021) + +**Highlights** + +- Support TSM-MobileNetV2 +- Support TANet +- Support GPU Normalize + +**New Features** + +- Support TSM-MobileNetV2 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Support flip with label mapping ([#591](https://github.com/open-mmlab/mmaction2/pull/591)) +- Add seed option for sampler ([#642](https://github.com/open-mmlab/mmaction2/pull/642)) +- Support GPU Normalize ([#586](https://github.com/open-mmlab/mmaction2/pull/586)) +- Support TANet ([#595](https://github.com/open-mmlab/mmaction2/pull/595)) + +**Improvements** + +- Training custom classes of ava dataset 
([#555](https://github.com/open-mmlab/mmaction2/pull/555)) +- Add CN README in homepage ([#592](https://github.com/open-mmlab/mmaction2/pull/592), [#594](https://github.com/open-mmlab/mmaction2/pull/594)) +- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625)) +- Refactor config: Specify `train_cfg` and `test_cfg` in `model` ([#629](https://github.com/open-mmlab/mmaction2/pull/629)) +- Provide an alternative way to download older kinetics annotations ([#597](https://github.com/open-mmlab/mmaction2/pull/597)) +- Update FAQ for + - 1). data pipeline about video and frames ([#598](https://github.com/open-mmlab/mmaction2/pull/598)) + - 2). how to show results ([#598](https://github.com/open-mmlab/mmaction2/pull/598)) + - 3). batch size setting for batchnorm ([#657](https://github.com/open-mmlab/mmaction2/pull/657)) + - 4). how to fix stages of backbone when finetuning models ([#658](https://github.com/open-mmlab/mmaction2/pull/658)) +- Modify default value of `save_best` ([#600](https://github.com/open-mmlab/mmaction2/pull/600)) +- Use BibTex rather than latex in markdown ([#607](https://github.com/open-mmlab/mmaction2/pull/607)) +- Add warnings of uninstalling mmdet and supplementary documents ([#624](https://github.com/open-mmlab/mmaction2/pull/624)) +- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625)) + +**Bug and Typo Fixes** + +- Fix value of `pem_low_temporal_iou_threshold` in BSN ([#556](https://github.com/open-mmlab/mmaction2/pull/556)) +- Fix ActivityNet download script ([#601](https://github.com/open-mmlab/mmaction2/pull/601)) + +**ModelZoo** + +- Add TSM-MobileNetV2 for Kinetics400 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Add deeper SlowFast models ([#605](https://github.com/open-mmlab/mmaction2/pull/605)) + +## 0.11.0 (31/01/2021) + +**Highlights** + +- Support imgaug +- Support spatial temporal demo +- Refactor EvalHook, config structure, 
unittest structure + +**New Features** + +- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492)) +- Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511)) +- Add spatial temporal demo ([#547](https://github.com/open-mmlab/mmaction2/pull/547), [#566](https://github.com/open-mmlab/mmaction2/pull/566)) + +**Improvements** + +- Refactor EvalHook ([#395](https://github.com/open-mmlab/mmaction2/pull/395)) +- Refactor AVA hook ([#567](https://github.com/open-mmlab/mmaction2/pull/567)) +- Add repo citation ([#545](https://github.com/open-mmlab/mmaction2/pull/545)) +- Add dataset size of Kinetics400 ([#503](https://github.com/open-mmlab/mmaction2/pull/503)) +- Add lazy operation docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504)) +- Add class_weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509)) +- add some explanation about the resampling in slowfast ([#502](https://github.com/open-mmlab/mmaction2/pull/502)) +- Modify paper title in README.md ([#512](https://github.com/open-mmlab/mmaction2/pull/512)) +- Add alternative ways to download Kinetics ([#521](https://github.com/open-mmlab/mmaction2/pull/521)) +- Add OpenMMLab projects link in README ([#530](https://github.com/open-mmlab/mmaction2/pull/530)) +- Change default preprocessing to shortedge to 256 ([#538](https://github.com/open-mmlab/mmaction2/pull/538)) +- Add config tag in dataset README ([#540](https://github.com/open-mmlab/mmaction2/pull/540)) +- Add solution for markdownlint installation issue ([#497](https://github.com/open-mmlab/mmaction2/pull/497)) +- Add dataset overview in readthedocs ([#548](https://github.com/open-mmlab/mmaction2/pull/548)) +- Modify the trigger mode of the warnings of missing mmdet 
([#583](https://github.com/open-mmlab/mmaction2/pull/583)) +- Refactor config structure ([#488](https://github.com/open-mmlab/mmaction2/pull/488), [#572](https://github.com/open-mmlab/mmaction2/pull/572)) +- Refactor unittest structure ([#433](https://github.com/open-mmlab/mmaction2/pull/433)) + +**Bug and Typo Fixes** + +- Fix a bug about ava dataset validation ([#527](https://github.com/open-mmlab/mmaction2/pull/527)) +- Fix a bug about ResNet pretrain weight initialization ([#582](https://github.com/open-mmlab/mmaction2/pull/582)) +- Fix a bug in CI due to MMCV index ([#495](https://github.com/open-mmlab/mmaction2/pull/495)) +- Remove invalid links of MiT and MMiT ([#516](https://github.com/open-mmlab/mmaction2/pull/516)) +- Fix frame rate bug for AVA preparation ([#576](https://github.com/open-mmlab/mmaction2/pull/576)) + +**ModelZoo** + +## 0.10.0 (31/12/2020) + +**Highlights** + +- Support Spatio-Temporal Action Detection (AVA) +- Support precise BN + +**New Features** + +- Support precise BN ([#501](https://github.com/open-mmlab/mmaction2/pull/501/)) +- Support Spatio-Temporal Action Detection (AVA) ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) +- Support to return feature maps in `inference_recognizer` ([#458](https://github.com/open-mmlab/mmaction2/pull/458)) + +**Improvements** + +- Add arg `stride` to long_video_demo.py, to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468)) +- Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) +- Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454)) +- Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/225)) +- Speed up confusion matrix calculation ([#465](https://github.com/open-mmlab/mmaction2/pull/465)) +- Use title case in modelzoo statistics ([#456](https://github.com/open-mmlab/mmaction2/pull/456)) +- Add FAQ documents for easy 
troubleshooting. ([#413](https://github.com/open-mmlab/mmaction2/pull/413), [#420](https://github.com/open-mmlab/mmaction2/pull/420), [#439](https://github.com/open-mmlab/mmaction2/pull/439)) +- Support Spatio-Temporal Action Detection with context ([#471](https://github.com/open-mmlab/mmaction2/pull/471)) +- Add class weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509)) +- Add Lazy OPs docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504)) + +**Bug and Typo Fixes** + +- Fix typo in default argument of BaseHead ([#446](https://github.com/open-mmlab/mmaction2/pull/446)) +- Fix potential bug about `output_config` overwrite ([#463](https://github.com/open-mmlab/mmaction2/pull/463)) + +**ModelZoo** + +- Add SlowOnly, SlowFast for AVA2.1 ([#351](https://github.com/open-mmlab/mmaction2/pull/351)) + +## 0.9.0 (30/11/2020) + +**Highlights** + +- Support GradCAM utils for recognizers +- Support ResNet Audio model + +**New Features** + +- Automatically add modelzoo statistics to readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327)) +- Support GYM99 ([#331](https://github.com/open-mmlab/mmaction2/pull/331), [#336](https://github.com/open-mmlab/mmaction2/pull/336)) +- Add AudioOnly Pathway from AVSlowFast. 
([#355](https://github.com/open-mmlab/mmaction2/pull/355)) +- Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324)) +- Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345)) +- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291)) + +**Improvements** + +- Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312)) +- Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274)) +- Update docs about test crops ([#359](https://github.com/open-mmlab/mmaction2/pull/359)) +- Polish code format using pylint manually ([#338](https://github.com/open-mmlab/mmaction2/pull/338)) +- Update unittest coverage ([#358](https://github.com/open-mmlab/mmaction2/pull/358), [#322](https://github.com/open-mmlab/mmaction2/pull/322), [#325](https://github.com/open-mmlab/mmaction2/pull/325)) +- Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323)) +- Update colab tutorial ([#367](https://github.com/open-mmlab/mmaction2/pull/367)) +- set default batch_size of evaluation and testing to 1 ([#250](https://github.com/open-mmlab/mmaction2/pull/250)) +- Rename the preparation docs to `README.md` ([#388](https://github.com/open-mmlab/mmaction2/pull/388)) +- Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329)) +- Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310)) +- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359)) + +**Bug and Typo Fixes** + +- Fix rename Kinetics classnames bug ([#384](https://github.com/open-mmlab/mmaction2/pull/384)) +- Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314)) +- Fix a bug about `tmp_folder` in `OpenCVInit` 
([#357](https://github.com/open-mmlab/mmaction2/pull/357)) +- Fix `get_thread_id` when not using disk as backend ([#354](https://github.com/open-mmlab/mmaction2/pull/354), [#357](https://github.com/open-mmlab/mmaction2/pull/357)) +- Fix the bug of HVU object `num_classes` from 1679 to 1678 ([#307](https://github.com/open-mmlab/mmaction2/pull/307)) +- Fix typo in `export_model.md` ([#399](https://github.com/open-mmlab/mmaction2/pull/399)) +- Fix OmniSource training configs ([#321](https://github.com/open-mmlab/mmaction2/pull/321)) +- Fix Issue #306: Bug of SampleAVAFrames ([#317](https://github.com/open-mmlab/mmaction2/pull/317)) + +**ModelZoo** + +- Add SlowOnly model for GYM99, both RGB and Flow ([#336](https://github.com/open-mmlab/mmaction2/pull/336)) +- Add auto modelzoo statistics in readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327)) +- Add TSN for HMDB51 pretrained on Kinetics400, Moments in Time and ImageNet. ([#372](https://github.com/open-mmlab/mmaction2/pull/372)) + +## v0.8.0 (31/10/2020) + +**Highlights** + +- Support [OmniSource](https://arxiv.org/abs/2003.13042) +- Support C3D +- Support video recognition with audio modality +- Support HVU +- Support X3D + +**New Features** + +- Support AVA dataset preparation ([#266](https://github.com/open-mmlab/mmaction2/pull/266)) +- Support the training of video recognition dataset with multiple tag categories ([#235](https://github.com/open-mmlab/mmaction2/pull/235)) +- Support joint training with multiple training datasets of multiple formats, including images, untrimmed videos, etc. 
([#242](https://github.com/open-mmlab/mmaction2/pull/242)) +- Support to specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216)) +- Implement X3D models, support testing with model weights converted from SlowFast ([#288](https://github.com/open-mmlab/mmaction2/pull/288)) +- Support specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216)) + +**Improvements** + +- Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232)) +- Extend HVU datatools to generate individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258)) +- Support data preparation for Kinetics-600 and Kinetics-700 ([#254](https://github.com/open-mmlab/mmaction2/pull/254)) +- Use `metric_dict` to replace hardcoded arguments in `evaluate` function ([#286](https://github.com/open-mmlab/mmaction2/pull/286)) +- Add `cfg-options` in arguments to override some settings in the used config for convenience ([#212](https://github.com/open-mmlab/mmaction2/pull/212)) +- Rename the old evaluating protocol `mean_average_precision` as `mmit_mean_average_precision` since it is only used on MMIT and is not the `mAP` we usually talk about. 
Add `mean_average_precision`, which is the real `mAP` ([#235](https://github.com/open-mmlab/mmaction2/pull/235)) +- Add accurate setting (Three crop * 2 clip) and report corresponding performance for TSM model ([#241](https://github.com/open-mmlab/mmaction2/pull/241)) +- Add citations in each preparing_dataset.md in `tools/data/dataset` ([#289](https://github.com/open-mmlab/mmaction2/pull/289)) +- Update the performance of audio-visual fusion on Kinetics-400 ([#281](https://github.com/open-mmlab/mmaction2/pull/281)) +- Support data preparation of OmniSource web datasets, including GoogleImage, InsImage, InsVideo and KineticsRawVideo ([#294](https://github.com/open-mmlab/mmaction2/pull/294)) +- Use `metric_options` dict to provide metric args in `evaluate` ([#286](https://github.com/open-mmlab/mmaction2/pull/286)) + +**Bug Fixes** + +- Register `FrameSelector` in `PIPELINES` ([#268](https://github.com/open-mmlab/mmaction2/pull/268)) +- Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245)) +- Fix multi-node dist test ([#292](https://github.com/open-mmlab/mmaction2/pull/292)) +- Fix the data preparation bug for `something-something` dataset ([#278](https://github.com/open-mmlab/mmaction2/pull/278)) +- Fix the invalid config url in slowonly README data benchmark ([#249](https://github.com/open-mmlab/mmaction2/pull/249)) +- Validate that the performance of models trained with videos have no significant difference comparing to the performance of models trained with rawframes ([#256](https://github.com/open-mmlab/mmaction2/pull/256)) +- Correct the `img_norm_cfg` used by TSN-3seg-R50 UCF-101 model, improve the Top-1 accuracy by 3% ([#273](https://github.com/open-mmlab/mmaction2/pull/273)) + +**ModelZoo** + +- Add Baselines for Kinetics-600 and Kinetics-700, including TSN-R50-8seg and SlowOnly-R50-8x8 ([#259](https://github.com/open-mmlab/mmaction2/pull/259)) +- Add OmniSource benchmark on MiniKinetics 
([#296](https://github.com/open-mmlab/mmaction2/pull/296)) +- Add Baselines for HVU, including TSN-R18-8seg on 6 tag categories of HVU ([#287](https://github.com/open-mmlab/mmaction2/pull/287)) +- Add X3D models ported from [SlowFast](https://github.com/facebookresearch/SlowFast/) ([#288](https://github.com/open-mmlab/mmaction2/pull/288)) + +## v0.7.0 (30/9/2020) + +**Highlights** + +- Support TPN +- Support JHMDB, UCF101-24, HVU dataset preparation +- support onnx model conversion + +**New Features** + +- Support the data pre-processing pipeline for the HVU Dataset ([#277](https://github.com/open-mmlab/mmaction2/pull/227/)) +- Support real-time action recognition from web camera ([#171](https://github.com/open-mmlab/mmaction2/pull/171)) +- Support onnx ([#160](https://github.com/open-mmlab/mmaction2/pull/160)) +- Support UCF101-24 preparation ([#219](https://github.com/open-mmlab/mmaction2/pull/219)) +- Support evaluating mAP for ActivityNet with [CUHK17_activitynet_pred](http://activity-net.org/challenges/2017/evaluation.html) ([#176](https://github.com/open-mmlab/mmaction2/pull/176)) +- Add the data pipeline for ActivityNet, including downloading videos, extracting RGB and Flow frames, finetuning TSN and extracting feature ([#190](https://github.com/open-mmlab/mmaction2/pull/190)) +- Support JHMDB preparation ([#220](https://github.com/open-mmlab/mmaction2/pull/220)) + +**ModelZoo** + +- Add finetuning setting for SlowOnly ([#173](https://github.com/open-mmlab/mmaction2/pull/173)) +- Add TSN and SlowOnly models trained with [OmniSource](https://arxiv.org/abs/2003.13042), which achieve 75.7% Top-1 with TSN-R50-3seg and 80.4% Top-1 with SlowOnly-R101-8x8 ([#215](https://github.com/open-mmlab/mmaction2/pull/215)) + +**Improvements** + +- Support demo with video url ([#165](https://github.com/open-mmlab/mmaction2/pull/165)) +- Support multi-batch when testing ([#184](https://github.com/open-mmlab/mmaction2/pull/184)) +- Add tutorial for adding a new learning rate 
updater ([#181](https://github.com/open-mmlab/mmaction2/pull/181)) +- Add config name in meta info ([#183](https://github.com/open-mmlab/mmaction2/pull/183)) +- Remove git hash in `__version__` ([#189](https://github.com/open-mmlab/mmaction2/pull/189)) +- Check mmcv version ([#189](https://github.com/open-mmlab/mmaction2/pull/189)) +- Update url with 'https://download.openmmlab.com' ([#208](https://github.com/open-mmlab/mmaction2/pull/208)) +- Update Docker file to support PyTorch 1.6 and update `install.md` ([#209](https://github.com/open-mmlab/mmaction2/pull/209)) +- Polish readthedocs display ([#217](https://github.com/open-mmlab/mmaction2/pull/217), [#229](https://github.com/open-mmlab/mmaction2/pull/229)) + +**Bug Fixes** + +- Fix the bug when using OpenCV to extract only RGB frames with original shape ([#184](https://github.com/open-mmlab/mmaction2/pull/187)) +- Fix the bug of sthv2 `num_classes` from 339 to 174 ([#174](https://github.com/open-mmlab/mmaction2/pull/174), [#207](https://github.com/open-mmlab/mmaction2/pull/207)) + +## v0.6.0 (2/9/2020) + +**Highlights** + +- Support TIN, CSN, SSN, NonLocal +- Support FP16 training + +**New Features** + +- Support NonLocal module and provide ckpt in TSM and I3D ([#41](https://github.com/open-mmlab/mmaction2/pull/41)) +- Support SSN ([#33](https://github.com/open-mmlab/mmaction2/pull/33), [#37](https://github.com/open-mmlab/mmaction2/pull/37), [#52](https://github.com/open-mmlab/mmaction2/pull/52), [#55](https://github.com/open-mmlab/mmaction2/pull/55)) +- Support CSN ([#87](https://github.com/open-mmlab/mmaction2/pull/87)) +- Support TIN ([#53](https://github.com/open-mmlab/mmaction2/pull/53)) +- Support HMDB51 dataset preparation ([#60](https://github.com/open-mmlab/mmaction2/pull/60)) +- Support encoding videos from frames ([#84](https://github.com/open-mmlab/mmaction2/pull/84)) +- Support FP16 training ([#25](https://github.com/open-mmlab/mmaction2/pull/25)) +- Enhance demo by supporting rawframe inference 
([#59](https://github.com/open-mmlab/mmaction2/pull/59)), output video/gif ([#72](https://github.com/open-mmlab/mmaction2/pull/72)) + +**ModelZoo** + +- Update Slowfast modelzoo ([#51](https://github.com/open-mmlab/mmaction2/pull/51)) +- Update TSN, TSM video checkpoints ([#50](https://github.com/open-mmlab/mmaction2/pull/50)) +- Add data benchmark for TSN ([#57](https://github.com/open-mmlab/mmaction2/pull/57)) +- Add data benchmark for SlowOnly ([#77](https://github.com/open-mmlab/mmaction2/pull/77)) +- Add BSN/BMN performance results with feature extracted by our codebase ([#99](https://github.com/open-mmlab/mmaction2/pull/99)) + +**Improvements** + +- Polish data preparation codes ([#70](https://github.com/open-mmlab/mmaction2/pull/70)) +- Improve data preparation scripts ([#58](https://github.com/open-mmlab/mmaction2/pull/58)) +- Improve unittest coverage and minor fix ([#62](https://github.com/open-mmlab/mmaction2/pull/62)) +- Support PyTorch 1.6 in CI ([#117](https://github.com/open-mmlab/mmaction2/pull/117)) +- Support `with_offset` for rawframe dataset ([#48](https://github.com/open-mmlab/mmaction2/pull/48)) +- Support json annotation files ([#119](https://github.com/open-mmlab/mmaction2/pull/119)) +- Support `multi-class` in TSMHead ([#104](https://github.com/open-mmlab/mmaction2/pull/104)) +- Support using `val_step()` to validate data for each `val` workflow ([#123](https://github.com/open-mmlab/mmaction2/pull/123)) +- Use `xxInit()` method to get `total_frames` and make `total_frames` a required key ([#90](https://github.com/open-mmlab/mmaction2/pull/90)) +- Add paper introduction in model readme ([#140](https://github.com/open-mmlab/mmaction2/pull/140)) +- Adjust the directory structure of `tools/` and rename some scripts files ([#142](https://github.com/open-mmlab/mmaction2/pull/142)) + +**Bug Fixes** + +- Fix configs for localization test ([#67](https://github.com/open-mmlab/mmaction2/pull/67)) +- Fix configs of SlowOnly by fixing lr to 8 gpus 
([#136](https://github.com/open-mmlab/mmaction2/pull/136)) +- Fix the bug in analyze_log ([#54](https://github.com/open-mmlab/mmaction2/pull/54)) +- Fix the bug of generating HMDB51 class index file ([#69](https://github.com/open-mmlab/mmaction2/pull/69)) +- Fix the bug of using `load_checkpoint()` in ResNet ([#93](https://github.com/open-mmlab/mmaction2/pull/93)) +- Fix the bug of `--work-dir` when using slurm training script ([#110](https://github.com/open-mmlab/mmaction2/pull/110)) +- Correct the sthv1/sthv2 rawframes filelist generate command ([#71](https://github.com/open-mmlab/mmaction2/pull/71)) +- `CosineAnnealing` typo ([#47](https://github.com/open-mmlab/mmaction2/pull/47)) + +## v0.5.0 (9/7/2020) + +**Highlights** + +- MMAction2 is released + +**New Features** + +- Support various datasets: UCF101, Kinetics-400, Something-Something V1&V2, Moments in Time, + Multi-Moments in Time, THUMOS14 +- Support various action recognition methods: TSN, TSM, R(2+1)D, I3D, SlowOnly, SlowFast, Non-local +- Support various action localization methods: BSN, BMN +- Colab demo for action recognition diff --git a/docs/en/notes/ecosystem.md b/docs/en/notes/ecosystem.md new file mode 100644 index 0000000000000000000000000000000000000000..3f1b2a784d7e20a5cae0d95e0d3838ba20d3feb7 --- /dev/null +++ b/docs/en/notes/ecosystem.md @@ -0,0 +1,24 @@ +# Ecosystem Projects based on MMAction2 + +There are many research works and projects built on MMAction2. +We list some of them as examples of how to extend MMAction2 for your own projects. +As the page might not be completed, please feel free to create a PR to update this page. + +## Projects as an extension + +- [OTEAction2](https://github.com/openvinotoolkit/mmaction2): OpenVINO Training Extensions for Action Recognition. +- [PYSKL](https://github.com/kennymckormick/pyskl): A Toolbox Focusing on Skeleton-Based Action Recognition. + +## Projects of papers + +There are also projects released with papers. 
+Some of the papers are published in top-tier conferences (CVPR, ICCV, and ECCV), the others are also highly influential. +To make this list also a reference for the community to develop and compare new video understanding algorithms, we list them following the time order of top-tier conferences. +Methods already supported and maintained by MMAction2 are not listed. + +- Video Swin Transformer, CVPR 2022. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 Oral. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 Oral. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) +- MGSampler: An Explainable Sampling Strategy for Video Action Recognition, ICCV 2021. [\[paper\]](https://arxiv.org/abs/2104.09952)[\[github\]](https://github.com/MCG-NJU/MGSampler) +- MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions, ICCV 2021. [\[paper\]](https://arxiv.org/abs/2105.07404) +- Long Short-Term Transformer for Online Action Detection, NeurIPS 2021 [\[paper\]](https://arxiv.org/abs/2107.03377)[\[github\]](https://github.com/amazon-research/long-short-term-transformer) diff --git a/docs/en/notes/pytorch2.0.md b/docs/en/notes/pytorch2.0.md new file mode 100644 index 0000000000000000000000000000000000000000..09499beacd30f21384ebf64ab62e2607a2675d11 --- /dev/null +++ b/docs/en/notes/pytorch2.0.md @@ -0,0 +1,21 @@ +# PyTorch 2.0 Compatibility and Benchmark + +PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training & validation. We provide a benchmark result and compatibility of typical models in MMAction2. 
Except for one model (MViT) that fails to compile, the performance of other models remains consistent before and after compilation. + +| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric | +| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 | +| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible | diff --git a/docs/en/project_zoo.py b/docs/en/project_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5909e41e5680a7d12c98177df4e031e7c55bd1 --- /dev/null +++ b/docs/en/project_zoo.py @@ 
-0,0 +1,25 @@ +#!/usr/bin/env python +from pathlib import Path + +from utils import replace_link + +# This script reads /projects/*/README.md and generate projectzoo.md + +all_files = list(Path('../../projects/').glob('*/README.md')) +example_project = '../../projects/example_project/README.md' +all_files.remove(Path(example_project)) +all_files.insert(0, Path(example_project)) + +project_zoo = open('../../projects/README.md').read() +for file in all_files: + with open(file) as f: + content = f.read() + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + + project_zoo += content + +with open('projectzoo.md', 'w') as f: + f.write(project_zoo) diff --git a/docs/en/stat.py b/docs/en/stat.py new file mode 100644 index 0000000000000000000000000000000000000000..350fa113e9bd80a8a6b81cd2874a138f7e7cbe06 --- /dev/null +++ b/docs/en/stat.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +import re +import shutil +from collections import defaultdict +from pathlib import Path + +from modelindex.load_model_index import load +from modelindex.models.Result import Result +from tabulate import tabulate +from utils import replace_link + +MMACT_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages. +GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' +MODELZOO_TEMPLATE = """\ +# Model Zoo Summary + +In this page, we list [all algorithms](#all-supported-algorithms) we support. You can click the link to jump to the corresponding model pages. + +And we also list all checkpoints for different tasks we provide. You can sort or search checkpoints in the table and click the corresponding link to model pages for more details. 
+ +## All supported algorithms + +* Number of papers: {num_papers} +{type_msg} + +* Number of checkpoints: {num_ckpts} +{paper_msg} + +""" # noqa: E501 + +METRIC_ALIAS = { + 'Top 1 Accuracy': 'Top-1 (%)', + 'Top 5 Accuracy': 'Top-5 (%)', +} + +TASK_MAP = dict( + detection='Spatio Temporal Action Detection Models', + localization='Action Localization Models', + recognition='Action Recognition Models', + skeleton='Skeleton-based Action Recognition Models', + retrieval='Video Retrieval Models', + recognition_audio='Audio-based Action Recognition Models') + +model_index = load(str(MMACT_ROOT / 'model-index.yml')) + + +def build_collections(model_index): + # add models for collections + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col + + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) + if model.results is None: + setattr(model, 'tasks', []) + else: + setattr(model, 'tasks', [result.task for result in model.results]) + + +build_collections(model_index) + +# save a map from model name to title in README +model2title = dict() + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMACT_ROOT / collection.readme) as f: + readme = f.read() + + ckpts = set(x.lower().strip() + for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme_title = re.search(r'^#\s+.+', readme) + + readme = Path(collection.filepath).parents[1].with_suffix('.md').name + model = Path(collection.filepath).parent.name + model2title[model] = readme_title.group()[2:].replace(' ', '-') + paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}' + f'#{model2title[model]}) 
({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + + # Write a copy of README + with open(MMACT_ROOT / collection.readme) as f: + content = f.read() + readme_path = Path(collection.filepath) + copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name + if not copy.exists(): + with open(copy, 'w') as copy_file: + task = readme_path.parents[1].name + head_content = f'# {TASK_MAP[task]}\n' + copy_file.write(head_content) + + def lower_heading(match): + return '#' + match.group() + + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + Path(collection.readme)) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + Path(collection.readme)) + + content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M) + + with open(copy, 'a') as copy_file: + copy_file.write(content) + + +if PAPERS_ROOT.exists(): + shutil.rmtree(PAPERS_ROOT) +PAPERS_ROOT.mkdir(exist_ok=True) +for collection in model_index.collections: + generate_paper_page(collection) + + +def scatter_results(models): + model_result_pairs = [] + for model in models: + if model.results is None: + result = Result(task=None, dataset=None, metrics={}) + model_result_pairs.append((model, result)) + else: + for result in model.results: + model_result_pairs.append((model, result)) + return model_result_pairs + + +def generate_summary_table(task, model_result_pairs, title=None): + metrics = set() + for model, result in model_result_pairs: + if result.task == task: + metrics = metrics.union(result.metrics.keys()) + metrics = sorted(list(metrics)) + + rows = [] + + def 
convert2float(number): + units = {'M': 1e6, 'G': 1e9, 'T': 1e12} + if isinstance(number, str): + num = float(number.rstrip('MGT')) + number = num * units[number[-1]] + return number + + for model, result in model_result_pairs: + if result.task != task: + continue + name = model.name + if model.metadata.parameters is not None: + params = convert2float(model.metadata.parameters) + params = f'{params / 1e6:.2f}' # Params + else: + params = None + if model.metadata.flops is not None: + flops = convert2float(model.metadata.flops) + flops = f'{flops / 1e9:.2f}' # Flops + else: + flops = None + + readme = Path( + model.collection.filepath).parents[1].with_suffix('.md').name + model = Path(model.collection.filepath).parent.name + page = f'[link]({PAPERS_ROOT / readme}#{model2title[model]})' + model_metrics = [] + for metric in metrics: + model_metrics.append(str(result.metrics.get(metric, ''))) + + rows.append([name, params, flops, *model_metrics, page]) + + with open('modelzoo_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: model-summary\n""") + header = [ + 'Model', + 'Params (M)', + 'Flops (G)', + *[METRIC_ALIAS.get(metric, metric) for metric in metrics], + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, **table_cfg)) + f.write('\n```\n') + + +def generate_dataset_wise_table(task, model_result_pairs, title=None): + dataset_rows = defaultdict(list) + for model, result in model_result_pairs: + if result.task == task: + dataset_rows[result.dataset].append((model, result)) + + if title is not None: + with open('modelzoo_statistics.md', 'a') as f: + f.write(f'\n{title}') + for dataset, pairs in dataset_rows.items(): + generate_summary_table(task, pairs, title=f'### {dataset}') + + +model_result_pairs = scatter_results(model_index.models) + +# Generate Action Recognition Summary +generate_dataset_wise_table( + 
task='Action Recognition', + model_result_pairs=model_result_pairs, + title='## Action Recognition', +) + +# Generate Action Detection Summary +generate_dataset_wise_table( + task='Action Detection', + model_result_pairs=model_result_pairs, + title='## Action Detection', +) + +# Generate Skeleton-based Action Recognition Summary +generate_dataset_wise_table( + task='Skeleton-based Action Recognition', + model_result_pairs=model_result_pairs, + title='## Skeleton-based Action Recognition', +) + +# Generate Video Retrieval Summary +generate_dataset_wise_table( + task='Video Retrieval', + model_result_pairs=model_result_pairs, + title='## Video Retrieval', +) + +# Generate Temporal Action Localization Summary +generate_dataset_wise_table( + task='Temporal Action Localization', + model_result_pairs=model_result_pairs, + title='## Temporal Action Localization', +) diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md new file mode 100644 index 0000000000000000000000000000000000000000..88b3a3777af732797f98e5cba78c68808fa655c2 --- /dev/null +++ b/docs/en/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..8805b31e00fbcfb35145ba2d8cf6c2f1cdcbf354 --- /dev/null +++ b/docs/en/useful_tools.md @@ -0,0 +1,92 @@ +# Useful Tools + +Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory. 
+ +## Useful Tools Link + + + +- [Useful Tools](#useful-tools) + - [Useful Tools Link](#useful-tools-link) + - [Model Conversion](#model-conversion) + - [Prepare a model for publishing](#prepare-a-model-for-publishing) + - [Miscellaneous](#miscellaneous) + - [Evaluating a metric](#evaluating-a-metric) + - [Print the entire config](#print-the-entire-config) + - [Check videos](#check-videos) + - [Multi-Stream Fusion](#multi-stream-fusion) + + + +## Model Conversion + +### Prepare a model for publishing + +`tools/deployment/publish_model.py` helps users to prepare their model for publishing. + +Before you upload a model to AWS, you may want to: + +(1) convert model weights to CPU tensors. +(2) delete the optimizer states. +(3) compute the hash of the checkpoint file and append the hash id to the filename. + +```shell +python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +E.g., + +```shell +python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth +``` + +The final output filename will be `tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb-{hash id}.pth`. + +## Miscellaneous + +### Evaluating a metric + +`tools/analysis_tools/eval_metric.py` evaluates certain metrics of the results saved in a file according to a config file. + +The saved result file is created on `tools/test.py` by setting the arguments `--out ${RESULT_FILE}` to indicate the result file, +which stores the final output of the whole model. + +```shell +python tools/analysis/eval_metric.py ${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}] +``` + +### Print the entire config + +`tools/analysis_tools/print_config.py` prints the whole config verbatim, expanding all its imports. 
+ +```shell +python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` + +### Check videos + +`tools/analysis_tools/check_videos.py` uses specified video encoder to iterate all samples that are specified by the input configuration file, looks for invalid videos (corrupted or missing), and saves the corresponding file path to the output file. Please note that after deleting invalid videos, users need to regenerate the video file list. + +```shell +python tools/analysis_tools/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos] +``` + +### Multi-Stream Fusion + +`tools/analysis_tools/report_accuracy.py` uses the dumped results (by setting `--dump res.pkl` when testing) to fuse the multi-stream prediction scores, i.e., late fusion. + +```shell +python tools/analysis_tools/report_accuracy.py [--preds ${RESULT_PKL_1 [RESULT_PKL_2 ...]}] [--coefficients ${COEFFICIENT_1 [COEFFICIENT_2, ...]}] [--apply-softmax] +``` + +Take joint-bone fusion as an example, which is a general practice in the task of skeleton-based action recognition. + +```shell +python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0 +``` + +``` +Mean Class Accuracy: 0.9180 +Top 1 Accuracy: 0.9333 +Top 5 Accuracy: 0.9833 +``` diff --git a/docs/en/user_guides/config.md b/docs/en/user_guides/config.md new file mode 100644 index 0000000000000000000000000000000000000000..e86aab7fbf75af90ed69f0781fc3aff100ee03cc --- /dev/null +++ b/docs/en/user_guides/config.md @@ -0,0 +1,706 @@ +# Learn about Configs + +We use python files as configs, incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments. +You can find all the provided configs under `$MMAction2/configs`. 
If you wish to inspect the config file, +you may run `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` to see the complete config. + + + +- [Learn about Configs](#learn-about-configs) + - [Modify config through script arguments](#modify-config-through-script-arguments) + - [Config File Structure](#config-file-structure) + - [Config File Naming Convention](#config-file-naming-convention) + - [Config System for Action Recognition](#config-system-for-action-recognition) + - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection) + - [Config System for Action localization](#config-system-for-action-localization) + + + +## Modify config through script arguments + +When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to in-place modify the config. + +- Update config keys of dict. + + The config options can be specified following the order of the dict keys in the original config. + For example, `--cfg-options model.backbone.norm_eval=False` changes the all BN modules in model backbones to `train` mode. + +- Update keys inside a list of configs. + + Some config dicts are composed as a list in your config. For example, the training pipeline `train_pipeline` is normally a list + e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline, + you may specify `--cfg-options train_pipeline.0.type=DenseSampleFrames`. + +- Update values of list/tuples. + + If the value to be updated is a list or a tuple. For example, the config file normally sets `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. If you want to + change this key, you may specify `--cfg-options model.data_preprocessor.mean="[128,128,128]"`. Note that the quotation mark " is necessary to support list/tuple data types. + +## Config File Structure + +There are 3 basic component types under `configs/_base_`, models, schedules, default_runtime. 
+
+Many methods could be easily constructed with one of each like TSN, I3D, SlowOnly, etc.
+The configs that are composed by components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum of inheritance level is 3.
+
+For easy understanding, we recommend contributors to inherit from existing methods.
+For example, if some modification is made based on TSN, users may first inherit the basic TSN structure by specifying `_base_ = ../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder under `configs/TASK`.
+
+Please refer to [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) for detailed documentation.
+
+## Config File Naming Convention
+
+We follow the style below to name config files. Contributors are advised to follow the same style. The config file names are divided into several parts. Logically, different parts are concatenated by underscores `'_'`, and settings in the same part are concatenated by dashes `'-'`.
+
+```
+{algorithm info}_{module info}_{training info}_{data info}.py
+```
+
+`{xxx}` is a required field and `[yyy]` is optional.
+
+- `{algorithm info}`:
+  - `{model}`: model type, e.g. `tsn`, `i3d`, `swin`, `vit`, etc.
+  - `[model setting]`: specific setting for some models, e.g. `base`, `p16`, `w877`, etc.
+- `{module info}`:
+  - `[pretrained info]`: pretrained information, e.g. `kinetics400-pretrained`, `in1k-pre`, etc.
+  - `{backbone}`: backbone type. e.g. `r50` (ResNet-50), etc.
+  - `[backbone setting]`: specific setting for some backbones, e.g. `nl-dot-product`, `bnfrozen`, `nopool`, etc.
+- `{training info}`:
+  - `{gpu x batch_per_gpu}`: GPUs and samples per GPU.
+  - `{pipeline setting}`: frame sample setting, e.g. `dense`, `{clip_len}x{frame_interval}x{num_clips}`, `u48`, etc.
+  - `{schedule}`: training schedule, e.g. `coslr-20e`.
+- `{data info}`:
+  - `{dataset}`: dataset name, e.g. `kinetics400`, `mmit`, etc.
+  - `{modality}`: data modality, e.g. `rgb`, `flow`, `keypoint-2d`, etc.
+
+### Config System for Action Recognition
+
+We incorporate modular design into our config system,
+which is convenient to conduct various experiments.
+
+- An Example of TSN
+
+  To help the users have a basic idea of a complete config structure and the modules in an action recognition system,
+  we make brief comments on the config of TSN as the following.
+  For more detailed usage and alternatives for each parameter in each module, please refer to the API documentation.
+
+  ```python
+  # model settings
+  model = dict(  # Config of the model
+      type='Recognizer2D',  # Class name of the recognizer
+      backbone=dict(  # Dict for backbone
+          type='ResNet',  # Name of the backbone
+          pretrained='torchvision://resnet50',  # The url/site of the pretrained model
+          depth=50,  # Depth of ResNet model
+          norm_eval=False),  # Whether to set BN layers to eval mode when training
+      cls_head=dict(  # Dict for classification head
+          type='TSNHead',  # Name of classification head
+          num_classes=400,  # Number of classes to be classified.
+          in_channels=2048,  # The input channels of classification head.
+ spatial_type='avg', # Type of pooling in spatial dimension + consensus=dict(type='AvgConsensus', dim=1), # Config of consensus module + dropout_ratio=0.4, # Probability in dropout layer + init_std=0.01, # Std value for linear layer initiation + average_clips='prob'), # Method to average multiple clip results + data_preprocessor=dict( # Dict for data preprocessor + type='ActionDataPreprocessor', # Name of data preprocessor + mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize + std=[58.395, 57.12, 57.375], # Std values of different channels to normalize + format_shape='NCHW'), # Final image shape format + # model training and testing settings + train_cfg=None, # Config of training hyperparameters for TSN + test_cfg=None) # Config for testing hyperparameters for TSN. + + # dataset settings + dataset_type = 'RawframeDataset' # Type of dataset for training, validation and testing + data_root = 'data/kinetics400/rawframes_train/' # Root path to data for training + data_root_val = 'data/kinetics400/rawframes_val/' # Root path to data for validation and testing + ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # Path to the annotation file for training + ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for validation + ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for testing + + train_pipeline = [ # Training data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=3), # Number of clips to be sampled + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), 
# The scale to resize images + dict( # Config of MultiScaleCrop + type='MultiScaleCrop', # Multi scale crop pipeline, cropping images with a list of randomly selected scales + input_size=224, # Input size of the network + scales=(1, 0.875, 0.75, 0.66), # Scales of width and height to be selected + random_crop=False, # Whether to randomly sample cropping bbox + max_wh_scale_gap=1), # Maximum gap of w and h scale levels + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(224, 224), # The scale to resize images + keep_ratio=False), # Whether to resize with changing the aspect ratio + dict( # Config of Flip + type='Flip', # Flip Pipeline + flip_ratio=0.5), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + val_pipeline = [ # Validation data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=3, # Number of clips to be sampled + test_mode=True), # Whether to set test mode in sampling + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of CenterCrop + type='CenterCrop', # Center crop pipeline, cropping the center area from images + crop_size=224), # The size to crop images + dict( # Config of Flip + type='Flip', # Flip pipeline + flip_ratio=0), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + 
input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + test_pipeline = [ # Testing data processing pipeline + dict( # Config of SampleFrames + type='SampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=1, # Frames of each sampled output clip + frame_interval=1, # Temporal interval of adjacent sampled frames + num_clips=25, # Number of clips to be sampled + test_mode=True), # Whether to set test mode in sampling + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of TenCrop + type='TenCrop', # Ten crop pipeline, cropping ten area from images + crop_size=224), # The size to crop images + dict( # Config of Flip + type='Flip', # Flip pipeline + flip_ratio=0), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCHW'), # Final image shape format + dict(type='PackActionInputs') # Config of PackActionInputs + ] + + train_dataloader = dict( # Config of train dataloader + batch_size=32, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. 
Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + data_prefix=dict(img=data_root), # Prefix of frame path + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during validation + num_workers=8, # Workers to pre-fetch data for each single GPU during validation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(img=data_root_val), # Prefix of frame path + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # Config of test dataloader + batch_size=32, # Batch size of each single GPU during testing + num_workers=8, # Workers to pre-fetch data for each single GPU during testing + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of test dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(img=data_root_val), # Prefix of frame path + pipeline=test_pipeline, + test_mode=True)) + + # evaluation settings + val_evaluator = dict(type='AccMetric') # Config of validation evaluator + test_evaluator = val_evaluator # Config of testing evaluator + + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=100, # Total training epochs + val_begin=1, # The epoch that begins validating + 
val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validation loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=100, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[40, 80], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor of learning rate decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config of optimizer. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Name of optimizer + lr=0.01, # Learning rate + momentum=0.9, # Momentum factor + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. 
+ runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = None # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. 
If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` + +### Config System for Spatio-Temporal Action Detection + +We incorporate modular design into our config system, which is convenient to conduct various experiments. + +- An Example of FastRCNN + + To help the users have a basic idea of a complete config structure and the modules in a spatio-temporal action detection system, + we make brief comments on the config of FastRCNN as the following. + For more detailed usage and alternative for per parameter in each module, please refer to the API documentation. + + ```python + # model setting + model = dict( # Config of the model + type='FastRCNN', # Class name of the detector + _scope_='mmdet', # The scope of current config + backbone=dict( # Dict for backbone + type='ResNet3dSlowOnly', # Name of the backbone + depth=50, # Depth of ResNet model + pretrained=None, # The url/site of the pretrained model + pretrained2d=False, # If the pretrained model is 2D + lateral=False, # If the backbone is with lateral connections + num_stages=4, # Stages of ResNet model + conv1_kernel=(1, 7, 7), # Conv1 kernel size + conv1_stride_t=1, # Conv1 temporal stride + pool1_stride_t=1, # Pool1 temporal stride + spatial_strides=(1, 2, 2, 1)), # The spatial stride for each ResNet stage + roi_head=dict( # Dict for roi_head + type='AVARoIHead', # Name of the roi_head + bbox_roi_extractor=dict( # Dict for bbox_roi_extractor + type='SingleRoIExtractor3D', # Name of the bbox_roi_extractor + roi_layer_type='RoIAlign', # Type of the RoI op + output_size=8, # Output feature size of the RoI op + with_temporal_pool=True), # If temporal dim is pooled + bbox_head=dict( # Dict for bbox_head + type='BBoxHeadAVA', # Name of the bbox_head + in_channels=2048, # Number of channels of the input feature + num_classes=81, # Number of action classes + 1 + multilabel=True, # If the dataset is multilabel + dropout_ratio=0.5), # The dropout ratio used + data_preprocessor=dict( # 
Dict for data preprocessor + type='ActionDataPreprocessor', # Name of data preprocessor + mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize + std=[58.395, 57.12, 57.375], # Std values of different channels to normalize + format_shape='NCHW')), # Final image shape format + # model training and testing settings + train_cfg=dict( # Training config of FastRCNN + rcnn=dict( # Dict for rcnn training config + assigner=dict( # Dict for assigner + type='MaxIoUAssignerAVA', # Name of the assigner + pos_iou_thr=0.9, # IoU threshold for positive examples, > pos_iou_thr -> positive + neg_iou_thr=0.9, # IoU threshold for negative examples, < neg_iou_thr -> negative + min_pos_iou=0.9), # Minimum acceptable IoU for positive examples + sampler=dict( # Dict for sample + type='RandomSampler', # Name of the sampler + num=32, # Batch Size of the sampler + pos_fraction=1, # Positive bbox fraction of the sampler + neg_pos_ub=-1, # Upper bound of the ratio of num negative to num positive + add_gt_as_proposals=True), # Add gt bboxes as proposals + pos_weight=1.0)), # Loss weight of positive examples + test_cfg=dict(rcnn=None)) # Testing config of FastRCNN + + # dataset settings + dataset_type = 'AVADataset' # Type of dataset for training, validation and testing + data_root = 'data/ava/rawframes' # Root path to data + anno_root = 'data/ava/annotations' # Root path to annotations + + ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # Path to the annotation file for training + ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # Path to the annotation file for validation + + exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for training + exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for validation + + label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # Path to the label file + + proposal_file_train = 
f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # Path to the human detection proposals for training examples + proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # Path to the human detection proposals for validation examples + + train_pipeline = [ # Training data processing pipeline + dict( # Config of SampleFrames + type='AVASampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=4, # Frames of each sampled output clip + frame_interval=16), # Temporal interval of adjacent sampled frames + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of RandomRescale + type='RandomRescale', # Randomly rescale the shortedge by a given range + scale_range=(256, 320)), # The shortedge size range of RandomRescale + dict( # Config of RandomCrop + type='RandomCrop', # Randomly crop a patch with the given size + size=256), # The size of the cropped patch + dict( # Config of Flip + type='Flip', # Flip Pipeline + flip_ratio=0.5), # Probability of implementing flip + dict( # Config of FormatShape + type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format + input_format='NCTHW', # Final image shape format + collapse=True), # Collapse the dim N if N == 1 + dict(type='PackActionInputs') # Pack input data + ] + + val_pipeline = [ # Validation data processing pipeline + dict( # Config of SampleFrames + type='AVASampleFrames', # Sample frames pipeline, sampling frames from video + clip_len=4, # Frames of each sampled output clip + frame_interval=16), # Temporal interval of adjacent sampled frames + dict( # Config of RawFrameDecode + type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices + dict( # Config of Resize + type='Resize', # Resize pipeline + scale=(-1, 256)), # The scale to resize images + dict( # Config of FormatShape + type='FormatShape', 
# Format shape pipeline, Format final image shape to the given input_format + input_format='NCTHW', # Final image shape format + collapse=True), # Collapse the dim N if N == 1 + dict(type='PackActionInputs') # Pack input data + ] + + train_dataloader = dict( # Config of train dataloader + batch_size=32, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + exclude_file=exclude_file_train, # Path of exclude annotation file + label_file=label_file, # Path of label file + data_prefix=dict(img=data_root), # Prefix of frame path + proposal_file=proposal_file_train, # Path of human detection proposals + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during evaluation + num_workers=8, # Workers to pre-fetch data for each single GPU during evaluation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + exclude_file=exclude_file_val, # Path of exclude annotation file + label_file=label_file, # Path of label file + data_prefix=dict(img=data_root_val), # Prefix of frame path + 
proposal_file=proposal_file_val, # Path of human detection proposals + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = val_dataloader # Config of testing dataloader + + # evaluation settings + val_evaluator = dict( # Config of validation evaluator + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) + test_evaluator = val_evaluator # Config of testing evaluator + + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=20, # Total training epochs + val_begin=1, # The epoch that begins validating + val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validation loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='LinearLR', # Decays the learning rate of each parameter group by linearly changing small multiplicative factor + start_factor=0.1, # The number we multiply learning rate in the first epoch + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + begin=0, # Step at which to start updating the learning rate + end=5), # Step at which to stop updating the learning rate + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=20, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[10, 15], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor of learning rate decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config 
of optimizer. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Name of optimizer + lr=0.2, # Learning rate + momentum=0.9, # Momentum factor + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. + runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log 
information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` + +### Config System for Action localization + +We incorporate modular design into our config system, +which is convenient to conduct various experiments. + +- An Example of BMN + + To help the users have a basic idea of a complete config structure and the modules in an action localization system, + we make brief comments on the config of BMN as the following. + For more detailed usage and alternative for per parameter in each module, please refer to the [API documentation](https://mmaction2.readthedocs.io/en/latest/api.html). 
+ + ```python + # model settings + model = dict( # Config of the model + type='BMN', # Class name of the localizer + temporal_dim=100, # Total frames selected for each video + boundary_ratio=0.5, # Ratio for determining video boundaries + num_samples=32, # Number of samples for each proposal + num_samples_per_bin=3, # Number of bin samples for each sample + feat_dim=400, # Dimension of feature + soft_nms_alpha=0.4, # Soft NMS alpha + soft_nms_low_threshold=0.5, # Soft NMS low threshold + soft_nms_high_threshold=0.9, # Soft NMS high threshold + post_process_top_k=100) # Top k proposals in post process + + # dataset settings + dataset_type = 'ActivityNetDataset' # Type of dataset for training, validation and testing + data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for training + data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for validation and testing + ann_file_train = 'data/ActivityNet/anet_anno_train.json' # Path to the annotation file for training + ann_file_val = 'data/ActivityNet/anet_anno_val.json' # Path to the annotation file for validation + ann_file_test = 'data/ActivityNet/anet_anno_test.json' # Path to the annotation file for testing + + train_pipeline = [ # Training data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name'))] # Meta keys of input + val_pipeline = [ # Validation data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 
'feature_frame'))] # Meta keys of input + test_pipeline = [ # Testing data processing pipeline + dict(type='LoadLocalizationFeature'), # Load localization feature pipeline + dict( + type='PackLocalizationInputs', # Pack localization data + keys=('gt_bbox'), # Keys of input + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # Meta keys of input + train_dataloader = dict( # Config of train dataloader + batch_size=8, # Batch size of each single GPU during training + num_workers=8, # Workers to pre-fetch data for each single GPU during training + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed + sampler=dict( + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # Randomly shuffle the training data in each epoch + dataset=dict( # Config of train dataset + type=dataset_type, + ann_file=ann_file_train, # Path of annotation file + data_prefix=dict(video=data_root), # Prefix of video path + pipeline=train_pipeline)) + val_dataloader = dict( # Config of validation dataloader + batch_size=1, # Batch size of each single GPU during evaluation + num_workers=8, # Workers to pre-fetch data for each single GPU during evaluation + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of validation dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(video=data_root_val), # Prefix of video path + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # Config of test dataloader + batch_size=1, # Batch size of each single GPU during testing + 
num_workers=8, # Workers to pre-fetch data for each single GPU during testing + persistent_workers=True, # If `True`, the dataloader will not shut down the worker processes after an epoch end + sampler=dict( + type='DefaultSampler', + shuffle=False), # Not shuffle during validation and testing + dataset=dict( # Config of test dataset + type=dataset_type, + ann_file=ann_file_val, # Path of annotation file + data_prefix=dict(video=data_root_val), # Prefix of video path + pipeline=test_pipeline, + test_mode=True)) + + # evaluation settings + work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # Directory to save the model checkpoints and logs for the current experiments + val_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict( # Config of localization output + out=f'{work_dir}/results.json', # Path to the output file + output_format='json')) # File format of the output file + test_evaluator = val_evaluator # Set test_evaluator as val_evaluator + + max_epochs = 9 # Total epochs to train the model + train_cfg = dict( # Config of training loop + type='EpochBasedTrainLoop', # Name of training loop + max_epochs=max_epochs, # Total training epochs + val_begin=1, # The epoch that begins validating + val_interval=1) # Validation interval + val_cfg = dict( # Config of validation loop + type='ValLoop') # Name of validating loop + test_cfg = dict( # Config of testing loop + type='TestLoop') # Name of testing loop + + # learning policy + param_scheduler = [ # Parameter scheduler for updating optimizer parameters, support dict or list + dict(type='MultiStepLR', # Decays the learning rate once the number of epoch reaches one of the milestones + begin=0, # Step at which to start updating the learning rate + end=max_epochs, # Step at which to stop updating the learning rate + by_epoch=True, # Whether the scheduled learning rate is updated by epochs + milestones=[7, ], # Steps to decay the learning rate + gamma=0.1)] # Multiplicative factor 
of parameter value decay + + # optimizer + optim_wrapper = dict( # Config of optimizer wrapper + type='OptimWrapper', # Name of optimizer wrapper, switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Config of optimizer. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='Adam', # Name of optimizer + lr=0.001, # Learning rate + weight_decay=0.0001), # Weight decay + clip_grad=dict(max_norm=40, norm_type=2)) # Config of gradient clip + + # runtime settings + default_scope = 'mmaction' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # Hooks to execute default actions like updating model parameters and saving checkpoints. + runtime_info=dict(type='RuntimeInfoHook'), # The hook to updates runtime information into message hub + timer=dict(type='IterTimerHook'), # The logger used to record time spent during iteration + logger=dict( + type='LoggerHook', # The logger used to record logs during training/validation/testing phase + interval=20, # Interval to print the log + ignore_last=False), # Ignore the log of last iterations in each epoch + param_scheduler=dict(type='ParamSchedulerHook'), # The hook to update some hyper-parameters in optimizer + checkpoint=dict( + type='CheckpointHook', # The hook to save checkpoints periodically + interval=3, # The saving period + save_best='auto', # Specified metric to mearsure the best checkpoint during evaluation + max_keep_ckpts=3), # The maximum checkpoints to keep + sampler_seed=dict(type='DistSamplerSeedHook'), # Data-loading sampler for distributed training + sync_buffers=dict(type='SyncBuffersHook')) # Synchronize model buffers at the end of each epoch + env_cfg = dict( # Dict for setting environment + cudnn_benchmark=False, # Whether to enable cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # Parameters to setup 
multiprocessing + dist_cfg=dict(backend='nccl')) # Parameters to setup distributed environment, the port can also be set + + log_processor = dict( + type='LogProcessor', # Log processor used to format log information + window_size=20, # Default smooth interval + by_epoch=True) # Whether to format logs with epoch type + vis_backends = [ # List of visualization backends + dict(type='LocalVisBackend')] # Local visualization backend + visualizer = dict( # Config of visualizer + type='ActionVisualizer', # Name of visualizer + vis_backends=vis_backends) + log_level = 'INFO' # The level of logging + load_from = None # Load model checkpoint as a pre-trained model from a given path. This will not resume training. + resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`. + ``` diff --git a/docs/en/user_guides/finetune.md b/docs/en/user_guides/finetune.md new file mode 100644 index 0000000000000000000000000000000000000000..23046c589c509d720adfd9da39523647a917f18c --- /dev/null +++ b/docs/en/user_guides/finetune.md @@ -0,0 +1,331 @@ +# Finetuning Models + +This tutorial provides instructions for users to use the pre-trained models +to finetune them on other datasets, so that better performance can be achieved. + +- [Finetuning Models](#finetuning-models) + - [Outline](#outline) + - [Choose Template Config](#choose-template-config) + - [Modify Head](#modify-head) + - [Modify Dataset](#modify-dataset) + - [Modify Training Schedule](#modify-training-schedule) + - [Use Pre-Trained Model](#use-pre-trained-model) + - [Start Training](#start-training) + +## Outline + +There are two steps to finetune a model on a new dataset. + +1. Add support for the new dataset. See [Prepare Dataset](prepare_dataset.md) and [Customize Dataset](../advanced_guides/customize_dataset.md). +2. Modify the configs. This will be discussed in this tutorial. 
+ +## Choose Template Config + +Here, we would like to take `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as an example. We first copy this config file to the same folder and rename it to `tsn_ucf101.py`, then four parts in the config need attention, specifically, add new keys for non-existing keys and modify the original keys for existing keys. + +## Modify Head + +The `num_classes` in the `cls_head` need to be changed to the class number of the new dataset. +The weights of the pre-trained models are reused except for the final prediction layer. +So it is safe to change the class number. +In our case, UCF101 has 101 classes. +So we change it from 400 (class number of Kinetics-400) to 101. + +```python +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # change from 400 to 101 + )) +``` + +## Modify Dataset + +MMAction2 supports UCF101, Kinetics-400, Moments in Time, Multi-Moments in Time, THUMOS14, +Something-Something V1&V2, ActivityNet Dataset. +The users may need to adapt one of the above datasets to fit their special datasets. +You could refer to [Prepare Dataset](prepare_dataset.md) and [Customize Dataset](../advanced_guides/customize_dataset.md) for more details. +In our case, UCF101 is already supported by various dataset types, like `VideoDataset`, +so we change the config as follows. + +```python +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' +``` + +## Modify Training Schedule + +Finetuning usually requires a smaller learning rate and fewer training epochs. 
+ +```python +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) +``` + +## Use Pre-Trained Model + +To use the pre-trained model for the whole network, the new config adds the link of pre-trained models in the `load_from`. +We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](config.md), users can directly change it by setting `load_from` in their configs. + +```python +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' # model path can be found in model zoo +``` + +## Start Training + +Now, we have finished the fine-tuning config file as follows: + +```python +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # change from 400 to 101 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', 
clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = 
dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +An easier way is to inherit the kinetics400 config and only specify the modified keys. Please make sure that the custom config is in the same folder with `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`. 
+ +```python +_base_ = [ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # inherit template config +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101)) # change from 400 to 101 + + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +train_dataloader = dict( + dataset=dict( + ann_file=ann_file_train, + data_prefix=dict(video=data_root))) +val_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) +test_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # change from 100 to 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # change from 100 to 50 + by_epoch=True, + milestones=[20, 40], # change milestones + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # change from 0.01 to 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +You can use the following command to finetune a model on your dataset. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the TSN model on Kinetics-400 dataset in a deterministic option. 
+ +```shell +python tools/train.py configs/recognition/tsn/tsn_ucf101.py \ + --seed=0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](train_test.md). diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..7c8a5b63eb7ef3ea85e7276c7a912bc93a2fb591 --- /dev/null +++ b/docs/en/user_guides/inference.md @@ -0,0 +1,40 @@ +# Inference with existing models + +MMAction2 provides pre-trained models for video understanding in [Model Zoo](../modelzoo.md). +This note will show **how to use existing models to run inference on a given video**. + +As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test) + +## Inference on a given video + +MMAction2 provides high-level Python APIs for inference on a given video: + +- [init_recognizer](mmaction.apis.init_recognizer): Initialize a recognizer with a config and checkpoint +- [inference_recognizer](mmaction.apis.inference_recognizer): Inference on a given video + +Here is an example of building the model and running inference on a given video by using a Kinetics-400 pre-trained checkpoint. + +```{note} +If you use mmaction2 as a 3rd-party package, you need to download the config and the demo video in the example. + +Run 'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .' to download the required config. + +Run 'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4' to download the desired demo video. 
+``` + +```python +from mmaction.apis import inference_recognizer, init_recognizer + +config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # can be a local path +img_path = 'demo/demo.mp4' # you can specify your own picture path + +# build the model from a config file and a checkpoint file +model = init_recognizer(config_path, checkpoint_path, device="cpu") # device can be 'cuda:0' +# test a single image +result = inference_recognizer(model, img_path) +``` + +`result` is a dictionary containing `pred_scores`. + +An action recognition demo can be found in [demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py). diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..602effa8633249db7013d619350eb85ee62b6f0b --- /dev/null +++ b/docs/en/user_guides/prepare_dataset.md @@ -0,0 +1,305 @@ +# Prepare Dataset + +MMAction2 supports many existing datasets. In this chapter, we will lead you to prepare datasets for MMAction2. 
+ +- [Prepare Dataset](#prepare-dataset) + - [Notes on Video Data Format](#notes-on-video-data-format) + - [Use built-in datasets](#use-built-in-datasets) + - [Use a custom dataset](#use-a-custom-dataset) + - [Action Recognition](#action-recognition) + - [Skeleton-based Action Recognition](#skeleton-based-action-recognition) + - [Audio-based Action Recognition](#audio-based-action-recognition) + - [Spatio-temporal Action Detection](#spatio-temporal-action-detection) + - [Temporal Action Localization](#temporal-action-localization) + - [Use mixed datasets for training](#use-mixed-datasets-for-training) + - [Repeat dataset](#repeat-dataset) + - [Browse dataset](#browse-dataset) + +## Notes on Video Data Format + +MMAction2 supports two types of data formats: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks). +This is fast when SSD is available but fails to scale to the fast-growing datasets. +(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.) +The latter saves much space but has to do the computation intensive video decoding at execution time. +To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. + +## Use built-in datasets + +MMAction2 already supports many datasets, we provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`, please refer to [supported datasets](https://mmaction2.readthedocs.io/en/latest/datasetzoo_statistics.html) for details to prepare specific datasets. 
+ +## Use a custom dataset + +The simplest way is to convert your dataset to existing dataset formats: + +- `RawFrameDataset` and `VideoDataset` for [Action Recognition](#action-recognition) +- `PoseDataset` for [Skeleton-based Action Recognition](#skeleton-based-action-recognition) +- `AudioDataset` for [Audio-based Action Recognition](#Audio-based-action-recognition) +- `AVADataset` for [Spatio-temporal Action Detection](#spatio-temporal-action-detection) +- `ActivityNetDataset` for [Temporal Action Localization](#temporal-action-localization) + +After the data pre-processing, the users need to further modify the config files to use the dataset. +Here is an example of using a custom dataset in rawframe format. + +In `configs/task/method/my_custom_config.py`: + +```python +... +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... +``` + +### Action Recognition + +There are two kinds of annotation files for action recognition. + +- rawframe annotaiton for `RawFrameDataset` + + The annotation of a rawframe dataset is a text file with multiple lines, + and each line indicates `frame_directory` (relative path) of a video, + `total_frames` of a video and the `label` of a video, which are split by a whitespace. + + Here is an example. 
+ + ``` + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + ``` + +- video annotation for `VideoDataset` + + The annotation of a video dataset is a text file with multiple lines, + and each line indicates a sample video with the `filepath` (relative path) and `label`, + which are split by a whitespace. + + Here is an example. + + ``` + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + ``` + +### Skeleton-based Action Recognition + +The task recognizes the action class based on the skeleton sequence (time sequence of keypoints). We provide some methods to build your custom skeleton dataset. + +- Build from RGB video data + + You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](https://github.com/open-mmlab/mmaction2/tree/main/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions. + +- Build from existing keypoint data + + Assuming that you already have keypoint data in coco formats, you can gather them into a pickle file. + + Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` + + 1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. + 2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + - `frame_dir` (str): The identifier of the corresponding video. + - `total_frames` (int): The number of frames in this video. + - `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of `(height, width)`. Only required for 2D skeletons. 
+ - `original_shape` (tuple\[int\]): Same as `img_shape`. + - `label` (int): The action label. + - `keypoint` (np.ndarray, with shape `[M x T x V x C]`): The keypoint annotation. + - M: number of persons; + - T: number of frames (same as `total_frames`); + - V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); + - C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + - `keypoint_score` (np.ndarray, with shape `[M x T x V]`): The confidence score of keypoints. Only required for 2D skeletons. + + Here is an example: + + ``` + { + "split": + { + 'xsub_train': + ['S001C001P001R001A001', ...], + 'xsub_val': + ['S001C001P003R001A001', ...], + ... + } + + "annotations: + [ + { + { + 'frame_dir': 'S001C001P001R001A001', + 'label': 0, + 'img_shape': (1080, 1920), + 'original_shape': (1080, 1920), + 'total_frames': 103, + 'keypoint': array([[[[1032. , 334.8], ...]]]) + 'keypoint_score': array([[[0.934 , 0.9766, ...]]]) + }, + { + 'frame_dir': 'S001C001P003R001A001', + ... + }, + ... + + } + ] + } + ``` + + Support other keypoint formats needs further modification, please refer to [customize dataset](../advanced_guides/customize_dataset.md). + +### Audio-based Action Recognition + +MMAction2 provides support for audio-based action recognition tasks utilizing the `AudioDataset`. This task employs mel spectrogram features as input. An example annotation file format is as follows: + +``` +ihWykL5mYRI.npy 300 153 +lumzQD42AN8.npy 240 321 +sWFRmD9Of4s.npy 250 250 +w_IpfgRsBVA.npy 300 356 +``` + +Each line represents a training sample. Taking the first line as an example, `ihWykL5mYRI.npy` corresponds to the filename of the mel spectrogram feature. The value `300` represents the total number of frames of the original video corresponding to this mel spectrogram feature, and `153` denotes the class label. 
We take the following two steps to prepare the mel spectrogram feature data: +
+First, extract `audios` from videos: +
+```shell
+cd $MMACTION2
+python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \
+    [--level ${LEVEL}]
+```
+
+- `ROOT`: The root directory of the videos.
+- `DST_ROOT`: The destination root directory of the audios.
+- `EXT`: Extension of the video files. e.g., `mp4`.
+- `N_WORKERS`: Number of processes to be used.
+
+Next, offline generate the `mel spectrogram features` from the audios:
+
+```shell
+cd $MMACTION2
+python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \
+    [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART]
+```
+
+- `AUDIO_HOME_PATH`: The root directory of the audio files.
+- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features.
+- `EXT`: Extension of the audio files. e.g., `m4a`.
+- `N_WORKERS`: Number of processes to be used.
+- `PART`: Determines how many parts to be split and which part to run. e.g., `2/5` means splitting all files into 5-fold and executing the 2nd part. This is useful if you have several machines.
+
+### Spatio-temporal Action Detection
+
+MMAction2 supports the task based on `AVADataset`. The annotation contains groundtruth bbox and proposal bbox. 
+
+- groundtruth bbox
+  groundtruth bbox is a csv file with multiple lines, and each line is a detection sample of one frame, with following formats:
+
+  video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id
+  each field means:
+  `video_identifier` : The identifier of the corresponding video
+  `time_stamp`: The time stamp of current frame
+  `lt_x`: The normalized x-coordinate of the left top point of bounding box
+  `lt_y`: The normalized y-coordinate of the left top point of bounding box
+  `rb_x`: The normalized x-coordinate of the right bottom point of bounding box
+  `rb_y`: The normalized y-coordinate of the right bottom point of bounding box
+  `label`: The action label
+  `entity_id`: a unique integer allowing this box to be linked to other boxes depicting the same person in adjacent frames of this video
+
+  Here is an example.
+
+  ```
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0
+  _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0
+  ...
+  ```
+
+- proposal bbox
+  proposal bbox is a pickle file generated by a person detector, and usually needs to be fine-tuned on the target dataset. The pickle file contains a dict with below data structure:
+
+  `{'video_identifier,time_stamp': bbox_info}`
+
+  video_identifier (str): The identifier of the corresponding video
+  time_stamp (int): The time stamp of current frame
+  bbox_info (np.ndarray, with shape `[n, 5]`): Detected bbox, `(x1, y1, x2, y2, score)`. x1, y1, x2, y2 are normalized with respect to frame size, which are between 0.0-1.0.
+
+### Temporal Action Localization
+
+We support Temporal Action Localization based on `ActivityNetDataset`. The annotation of ActivityNet dataset is a json file. Each key is a video name and the corresponding value is the meta data and annotation for the video.
+
+Here is an example. 
+ +``` +{ + "video1": { + "duration_second": 211.53, + "duration_frame": 6337, + "annotations": [ + { + "segment": [ + 30.025882995319815, + 205.2318595943838 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 6336, + "fps": 30.0, + "rfps": 29.9579255898 + }, + "video2": {... + } + ... +} +``` + +## Use mixed datasets for training + +MMAction2 also supports to mix dataset for training. Currently it supports to repeat dataset. + +### Repeat dataset + +We use `RepeatDataset` as wrapper to repeat the dataset. For example, suppose the original dataset as `Dataset_A`, +to repeat it, the config looks like the following + +```python +dataset_A_train = dict( + type='RepeatDataset', + times=N, + dataset=dict( # This is the original config of Dataset_A + type='Dataset_A', + ... + pipeline=train_pipeline + ) + ) +``` + +## Browse dataset + +coming soon... diff --git a/docs/en/user_guides/train_test.md b/docs/en/user_guides/train_test.md new file mode 100644 index 0000000000000000000000000000000000000000..637f90c63cccda13c4cbfaff8181bd9911a02925 --- /dev/null +++ b/docs/en/user_guides/train_test.md @@ -0,0 +1,252 @@ +# Training and Test + +- [Training and Test](#training-and-test) + - [Training](#training) + - [Training with your PC](#training-with-your-pc) + - [Training with multiple GPUs](#training-with-multiple-gpus) + - [Training with multiple machines](#training-with-multiple-machines) + - [Multiple machines in the same network](#multiple-machines-in-the-same-network) + - [Multiple machines managed with slurm](#multiple-machines-managed-with-slurm) + - [Test](#test) + - [Test with your PC](#test-with-your-pc) + - [Test with multiple GPUs](#test-with-multiple-gpus) + - [Test with multiple machines](#test-with-multiple-machines) + - [Multiple machines in the same network](#multiple-machines-in-the-same-network-1) + - [Multiple machines managed with slurm](#multiple-machines-managed-with-slurm-1) + +## Training + +### Training with your PC + +You can use 
`tools/train.py` to train a model on a single machine with a CPU and optionally a GPU. + +Here is the full usage of the script: + +```shell +python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +````{note} +By default, MMAction2 prefers GPU to CPU. If you want to train a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] +``` +```` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | The path to the config file. | +| `--work-dir WORK_DIR` | The target folder to save logs and checkpoints. Defaults to a folder with the same name of the config file under `./work_dirs`. | +| `--resume [RESUME]` | Resume training. If a path is specified, resume from it, while if not specified, try to auto resume from the latest checkpoint. | +| `--amp` | Enable automatic-mixed-precision training. | +| `--no-validate` | **Not suggested**. Disable checkpoint evaluation during training. | +| `--auto-scale-lr` | Auto scale the learning rate according to the actual batch size and the original batch size. | +| `--seed` | Random seed. | +| `--diff-rank-seed` | Whether or not set different seeds for different ranks. | +| `--deterministic` | Whether to set deterministic options for CUDNN backend. | +| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. 
| +| `--launcher {none,pytorch,slurm,mpi}` | Options for job launcher. Defaults to `none`. | + +### Training with multiple GPUs + +We provide a shell script to start a multi-GPUs task with `torch.distributed.launch`. + +```shell +bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +| ARGS | Description | +| ---------- | ---------------------------------------------------------------------------------- | +| `CONFIG` | The path to the config file. | +| `GPUS` | The number of GPUs to be used. | +| `[PYARGS]` | The other optional arguments of `tools/train.py`, see [here](#train-with-your-pc). | + +You can also specify extra arguments of the launcher by environment variables. For example, change the +communication port of the launcher to 29666 by the following command: + +```shell +PORT=29666 bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +If you want to startup multiple training jobs and use different GPUs, you can launch them by specifying +different port and visible devices. 
+ +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +``` + +### Training with multiple machines + +#### Multiple machines in the same network + +If you launch a training job with multiple machines connected with ethernet, you can run the following commands: + +On the first machine: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +On the second machine: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +The following extra environment variables need to be specified to train or test models with multiple machines: + +| ENV_VARS | Description | +| ------------- | ----------------------------------------------------------------------------------------------------- | +| `NNODES` | The total number of machines. Defaults to 1. | +| `NODE_RANK` | The index of the local machine. Defaults to 0. | +| `PORT` | The communication port, it should be the same in all machines. Defaults to 29500. | +| `MASTER_ADDR` | The IP address of the master machine, it should be the same in all machines. Defaults to `127.0.0.1`. | + +Usually it is slow if you do not have high speed networking like InfiniBand. + +#### Multiple machines managed with slurm + +If you run MMAction2 on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`. + +```shell +[ENV_VARS] bash tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG} [PY_ARGS] +``` + +Here are the arguments description of the script. + +| ARGS | Description | +| ----------- | ---------------------------------------------------------------------------------- | +| `PARTITION` | The partition to use in your cluster. | +| `JOB_NAME` | The name of your job, you can name it as you like. 
| +| `CONFIG` | The path to the config file. | +| `[PYARGS]` | The other optional arguments of `tools/train.py`, see [here](#train-with-your-pc). | + +Here are the environment variables can be used to configure the slurm job. + +| ENV_VARS | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------- | +| `GPUS` | The number of GPUs to be used. Defaults to 8. | +| `GPUS_PER_NODE` | The number of GPUs to be allocated per node. Defaults to 8. | +| `CPUS_PER_TASK` | The number of CPUs to be allocated per task (Usually one GPU corresponds to one task). Defaults to 5. | +| `SRUN_ARGS` | The other arguments of `srun`. Available options can be found [here](https://slurm.schedmd.com/srun.html). | + +## Test + +### Test with your PC + +You can use `tools/test.py` to test a model on a single machine with a CPU and optionally a GPU. + +Here is the full usage of the script: + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +````{note} +By default, MMAction2 prefers GPU to CPU. If you want to test a model on CPU, please empty `CUDA_VISIBLE_DEVICES` or set it to -1 to make GPU invisible to the program. + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` +```` + +| ARGS | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | The path to the config file. | +| `CHECKPOINT_FILE` | The path to the checkpoint file (It can be a http link) | +| `--work-dir WORK_DIR` | The directory to save the file containing evaluation metrics. Defaults to a folder with the same name of the config file under `./work_dirs`. | +| `--dump DUMP` | The path to dump all outputs of the model for offline evaluation. 
| +| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. | +| `--show-dir SHOW_DIR` | The directory to save the result visualization images. | +| `--show` | Visualize the prediction result in a window. | +| `--interval INTERVAL` | The interval of samples to visualize. Defaults to 1. | +| `--wait-time WAIT_TIME` | The display time of every window (in seconds). Defaults to 2. | +| `--launcher {none,pytorch,slurm,mpi}` | Options for job launcher. Defaults to `none`. | + +### Test with multiple GPUs + +We provide a shell script to start a multi-GPUs task with `torch.distributed.launch`. + +```shell +bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +| ARGS | Description | +| ------------ | -------------------------------------------------------------------------------- | +| `CONFIG` | The path to the config file. | +| `CHECKPOINT` | The path to the checkpoint file (It can be a http link) | +| `GPUS` | The number of GPUs to be used. | +| `[PYARGS]` | The other optional arguments of `tools/test.py`, see [here](#test-with-your-pc). | + +You can also specify extra arguments of the launcher by environment variables. For example, change the +communication port of the launcher to 29666 by the following command: + +```shell +PORT=29666 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +If you want to startup multiple test jobs and use different GPUs, you can launch them by specifying +different port and visible devices. 
+ +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +``` + +### Test with multiple machines + +#### Multiple machines in the same network + +If you launch a test job with multiple machines connected with ethernet, you can run the following commands: + +On the first machine: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +On the second machine: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +Compared with multi-GPUs in a single machine, you need to specify some extra environment variables: + +| ENV_VARS | Description | +| ------------- | ----------------------------------------------------------------------------------------------------- | +| `NNODES` | The total number of machines. Defaults to 1. | +| `NODE_RANK` | The index of the local machine. Defaults to 0. | +| `PORT` | The communication port, it should be the same in all machines. Defaults to 29500. | +| `MASTER_ADDR` | The IP address of the master machine, it should be the same in all machines. Defaults to `127.0.0.1`. | + +Usually it is slow if you do not have high speed networking like InfiniBand. + +#### Multiple machines managed with slurm + +If you run MMAction2 on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_test.sh`. + +```shell +[ENV_VARS] bash tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG} ${CHECKPOINT} [PY_ARGS] +``` + +Here are the arguments description of the script. + +| ARGS | Description | +| ------------ | -------------------------------------------------------------------------------- | +| `PARTITION` | The partition to use in your cluster. | +| `JOB_NAME` | The name of your job, you can name it as you like. 
| +| `CONFIG` | The path to the config file. | +| `CHECKPOINT` | The path to the checkpoint file (It can be a http link) | +| `[PYARGS]` | The other optional arguments of `tools/test.py`, see [here](#test-with-your-pc). | + +Here are the environment variables can be used to configure the slurm job. + +| ENV_VARS | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------- | +| `GPUS` | The number of GPUs to be used. Defaults to 8. | +| `GPUS_PER_NODE` | The number of GPUs to be allocated per node. Defaults to 8. | +| `CPUS_PER_TASK` | The number of CPUs to be allocated per task (Usually one GPU corresponds to one task). Defaults to 5. | +| `SRUN_ARGS` | The other arguments of `srun`. Available options can be found [here](https://slurm.schedmd.com/srun.html). | diff --git a/docs/en/utils.py b/docs/en/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..444e4c147d19d3f20686c81233d7ffc6e0821c19 --- /dev/null +++ b/docs/en/utils.py @@ -0,0 +1,28 @@ +import re +from pathlib import Path + + +def replace_link(pattern, template, content, file_path): + MMACT_ROOT = Path(__file__).absolute().parents[2] + GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' + + def replace_core(matchobj): + name = matchobj.group(1) + link = matchobj.group(2) + if link.startswith('http') or link.startswith('#'): + return template.format(name, link) + # For link relative to project folder, such as '/configs/*/*.py' + elif Path(link).is_absolute(): + link = link.lstrip('/') + folder = MMACT_ROOT + # For link relative to current file, such as './config/*.py' + else: + folder = file_path.parent + file_link = link.split('#')[0] + assert (folder / file_link).exists(), \ + f'Link not found:\n{file_path}: {folder / link}' + rel_link = (folder / link).resolve().relative_to(MMACT_ROOT) + link = GITHUB_PREFIX + str(rel_link) + return template.format(name, link) + + return 
re.sub(pattern, replace_core, content) diff --git a/docs/zh_cn/Makefile b/docs/zh_cn/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..73a28c7134cd1760744f34bac4ebdedfbed40f72 --- /dev/null +++ b/docs/zh_cn/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..55b3d3f8ffde0fd0e9d00e7f8b73124bba6cfe2d --- /dev/null +++ b/docs/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,62 @@ +.header-logo { + background-image: url("../images/logo.png"); + background-size: 130px 40px; + height: 40px; + width: 130px; +} + +@media screen and (min-width: 1100px) { + .header-logo { + top: -12px; + } + } + + pre { + white-space: pre; + } + + @media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } + } + + + article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; + } + + /* Disable the change in tables */ + article.pytorch-article section table code { + padding: unset; + background-color: unset; + 
border-radius: unset; + } + + table.autosummary td { + width: 50% + } + + img.align-center { + display: block; + margin-left: auto; + margin-right: auto; + } + + article.pytorch-article p.rubric { + font-weight: bold; + } diff --git a/docs/zh_cn/_static/images/logo.png b/docs/zh_cn/_static/images/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/docs/zh_cn/_static/images/logo.png differ diff --git a/docs/zh_cn/_static/js/custom.js b/docs/zh_cn/_static/js/custom.js new file mode 100644 index 0000000000000000000000000000000000000000..6afd7b02a2acfc8d608ff935f9951597b9310c11 --- /dev/null +++ b/docs/zh_cn/_static/js/custom.js @@ -0,0 +1,20 @@ +var collapsedSections = ['数据集支持']; + +$(document).ready(function () { + $('.model-summary').DataTable({ + "stateSave": false, + "lengthChange": false, + "pageLength": 20, + "order": [], + "language": { + "info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )", + "infoFiltered": "(筛选自 _MAX_ 条目)", + "search": "搜索:", + "zeroRecords": "没有找到任何条目", + "paginate": { + "next": "下一页", + "previous": "上一页" + }, + } + }); +}); diff --git a/docs/zh_cn/_templates/404.html b/docs/zh_cn/_templates/404.html new file mode 100644 index 0000000000000000000000000000000000000000..dfdc4c33992e96876205a84e184130cd6806c7f8 --- /dev/null +++ b/docs/zh_cn/_templates/404.html @@ -0,0 +1,16 @@ +{% extends "layout.html" %} + +{% block body %} + +

未找到页面

+

+ 未找到你要打开的页面。 +

+

+ 如果你是从旧版本文档跳转至此,可能是对应的页面被移动了。请从左侧的目录中寻找新版本文档,或者跳转至首页。 +

+

+ 如果你找不到希望打开的文档,欢迎在 Issue 中告诉我们! +

+ +{% endblock %} diff --git a/docs/zh_cn/advanced_guides/customize_dataset.md b/docs/zh_cn/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..3aa87119386802b3ae9e5d5ddb0835642484bc80 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_dataset.md @@ -0,0 +1,126 @@ +# 自定义数据集 + +在本教程中,我们将介绍如何通过在线转换来自定义你的数据集。 + +- [自定义数据集](#自定义数据集) + - [MMAction2 数据集概述](#mmaction2-数据集概述) + - [定制新的数据集](#定制新的数据集) + - [为 PoseDataset 自定义关键点格式](#为-posedataset-自定义关键点格式) + +## MMAction2 数据集概述 + +MMAction2 提供了任务特定的 `Dataset` 类,例如用于动作识别的 `VideoDataset`/`RawframeDataset`,用于时空动作检测的 `AVADataset`,用于基于骨骼的动作识别的`PoseDataset`。这些任务特定的数据集只需要实现 `load_data_list(self)` 来从注释文件生成数据列表。剩下的函数由超类(即 `BaseActionDataset` 和 `BaseDataset`)自动处理。下表显示了模块的继承关系和主要方法。 + +| 类名 | 类方法 | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MMAction2::VideoDataset` | `load_data_list(self)`
从注释文件中构建数据列表。 | +| `MMAction2::BaseActionDataset` | `get_data_info(self, idx)`
给定 `idx`,从数据列表中返回相应的数据样本。 | +| `MMEngine::BaseDataset` | `__getitem__(self, idx)`
给定 `idx`,调用 `get_data_info` 获取数据样本,然后调用 `pipeline` 在 `train_pipeline` 或 `val_pipeline` 中执行数据变换和增强。 | + +## 定制新的数据集类 + +大多数情况下,把你的数据集离线转换成指定格式是首选方法,但 MMAction2 提供了一个方便的过程来创建一个定制的 `Dataset` 类。如前所述,任务特定的数据集只需要实现 `load_data_list(self)` 来从注释文件生成数据列表。请注意,`data_list` 中的元素是包含后续流程中必要字段的 `dict`。 + +以 `VideoDataset` 为例,`train_pipeline`/`val_pipeline` 在 `DecordInit` 中需要 `'filename'`,在 `PackActionInputs` 中需要 `'label'`。因此,`data_list` 中的数据样本必须包含2个字段:`'filename'`和`'label'`。 +请参考[定制数据流水线](customize_pipeline.md)以获取有关 `pipeline` 的更多详细信息。 + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +`AVADataset` 会更加复杂,`data_list` 中的数据样本包含有关视频数据的几个字段。此外,它重写了 `get_data_info(self, idx)` 以转换在时空动作检测数据流水线中需要用的字段。 + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## 为 PoseDataset 自定义关键点格式 + +MMAction2 目前支持三种关键点格式:`coco`,`nturgb+d` 和 `openpose`。如果你使用其中一种格式,你可以简单地在以下模块中指定相应的格式: + +对于图卷积网络,如 AAGCN,STGCN,... 
+ +- `pipeline`:在 `JointToBone` 中的参数 `dataset`。 +- `backbone`:在图卷积网络中的参数 `graph_cfg`。 + +对于 PoseC3D: + +- `pipeline`:在 `Flip` 中,根据关键点的对称关系指定 `left_kp` 和 `right_kp`。 +- `pipeline`:在 `GeneratePoseTarget` 中,如果 `with_limb` 为 `True`,指定`skeletons`,`left_limb`,`right_limb`,如果 `with_kp` 为 `True`,指定`left_kp` 和 `right_kp`。 + +如果使用自定义关键点格式,需要在 `backbone` 和 `pipeline` 中都包含一个新的图布局。这个布局将定义关键点及其连接关系。 + +以 `coco` 数据集为例,我们在 `Graph` 中定义了一个名为 `coco` 的布局。这个布局的 `inward` 连接包括所有节点连接,每个**向心**连接由一个节点元组组成。`coco`的额外设置包括将节点数指定为 `17`,将 `node 0` 设为中心节点。 + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +同样,我们在 `JointToBone` 中定义了 `pairs`,添加了一个 bone `(0, 0)` 以使 bone 的数量对齐到 joint。coco数据集的 `pairs` 如下所示,`JointToBone` 中的 `pairs` 的顺序无关紧要。 + +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), + (5, 0), (6, 0), (7, 5), (8, 6), (9, 7), + (10, 8), (11, 0), (12, 0), (13, 11), (14, 12), + (15, 13), (16, 14)) +``` + +要使用你的自定义关键点格式,只需定义上述设置为你的图结构,并在你的配置文件中指定它们,如下所示。在这个例子中,我们将使用 `STGCN`,其中 `n` 表示类别的数量,`custom_dataset` 在 `Graph` 和 `JointToBone` 中定义。 + +```python +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] 
+ +``` + +只需简单地指定自定义布局,你就可以使用你自己的关键点格式进行训练和测试了。通过这种方式,MMAction2 为用户提供了很大的灵活性,允许用户自定义他们的数据集和关键点格式,以满足他们特定的需求。 + +以上就是关于如何自定义你的数据集的一些方法。希望这个教程能帮助你理解MMAction2的数据集结构,并教给你如何根据自己的需求创建新的数据集。虽然这可能需要一些编程知识,但是 MMAction2 试图使这个过程尽可能简单。通过了解这些基本概念,你将能够更好地控制你的数据,从而改进你的模型性能。 diff --git a/docs/zh_cn/advanced_guides/customize_logging.md b/docs/zh_cn/advanced_guides/customize_logging.md new file mode 100644 index 0000000000000000000000000000000000000000..c0283ae4d8cad19f340c12b3cfaa0146b57fc613 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# 自定义日志 + +MMAction2 在运行过程中会产生大量的日志,如损失、迭代时间、学习率等。在这一部分,我们将向你介绍如何输出自定义日志。有关日志系统的更多详细信息,请参考 [MMEngine 教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html)。 + +- [自定义日志](#自定义日志) + - [灵活的日志系统](#灵活的日志系统) + - [定制日志](#定制日志) + - [导出调试日志](#导出调试日志) + +## 灵活的日志系统 + +默认情况下,MMAction2 的日志系统由 [default_runtime](/configs/_base_/default_runtime.py) 中的 `LogProcessor` 配置: + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +默认情况下,`LogProcessor` 捕获 `model.forward` 返回的所有以 `loss` 开头的字段。例如,在以下模型中,`loss1` 和 `loss2` 将在没有任何额外配置的情况下自动记录到日志。 + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +输出日志遵循以下格式: + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +`LogProcessor` 将按以下格式输出日志: + +- 日志的前缀: + - epoch 模式(`by_epoch=True`):`Epoch(train) [{current_epoch}/{current_iteration}]/{dataloader_length}` + - 
iteration 模式(`by_epoch=False`):`Iter(train) [{current_iteration}/{max_iteration}]` +- 学习率 (`lr`):最后一次迭代的学习率。 +- 时间: + - `time`:过去 `window_size` 次迭代的推理平均时间。 + - `data_time`:过去 `window_size` 次迭代的数据加载平均时间。 + - `eta`:完成训练的预计到达时间。 +- 损失:过去 `window_size` 次迭代中模型输出的平均损失。 + +```{warning} +默认情况下,log_processor 输出基于 epoch 的日志(`by_epoch=True`)。要得到与 `train_cfg` 匹配的预期日志,我们应在 `train_cfg` 和 `log_processor` 中设置相同的 `by_epoch` 值。 +``` + +根据以上规则,代码片段将每20次迭代计算 loss1 和 loss2 的平均值。更多类型的统计方法,请参考 [mmengine.runner.LogProcessor](mmengine.runner.LogProcessor)。 + +## 定制日志 + +日志系统不仅可以记录 `loss`,`lr` 等,还可以收集和输出自定义日志。例如,如果我们想要统计中间损失: + +`ToyModel` 在 forward 中计算 `loss_tmp`,但不将其保存到返回字典中。 + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # 在消息中心更新中间的 `loss_tmp` + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +将 `loss_tmp` 添加到配置中: + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # 使用平均值统计 loss_tmp + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +`loss_tmp` 将被添加到输出日志中: + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## 导出调试日志 + +要将调试日志导出到 `work_dir`,你可以在配置文件中设置日志级别如下: + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its 
implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +此外,如果你正在使用共享存储训练你的模型,那么在 `debug` 模式下,不同排名的日志将被保存。日志的层级结构如下: + +```text +./tmp +├── tmp.log +├── tmp_rank1.log +├── tmp_rank2.log +├── tmp_rank3.log +├── tmp_rank4.log +├── tmp_rank5.log +├── tmp_rank6.log +└── tmp_rank7.log +... +└── tmp_rank63.log +``` + +在具有独立存储的多台机器上的日志: + +```text +# 设备:0: +work_dir/ +└── exp_name_logs + ├── exp_name.log + ├── exp_name_rank1.log + ├── exp_name_rank2.log + ├── exp_name_rank3.log + ... + └── exp_name_rank7.log + +# 设备:7: +work_dir/ +└── exp_name_logs + ├── exp_name_rank56.log + ├── exp_name_rank57.log + ├── exp_name_rank58.log + ... + └── exp_name_rank63.log +``` diff --git a/docs/zh_cn/advanced_guides/customize_models.md b/docs/zh_cn/advanced_guides/customize_models.md new file mode 100644 index 0000000000000000000000000000000000000000..32fc255b0097be6fe90ee656efb8f3be6dac35d1 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_models.md @@ -0,0 +1,3 @@ +# 自定义模型 + +内容建设中... 
diff --git a/docs/zh_cn/advanced_guides/customize_optimizer.md b/docs/zh_cn/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..6e09d100a916788c7d3862f717573fc3f51407e0 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_optimizer.md @@ -0,0 +1,332 @@ +# 自定义优化器 + +在本教程中,我们将介绍一些构建优化器和学习率策略的方法,以用于你的任务。 + +- [自定义优化器](#自定义优化器) + - [使用 optim_wrapper 构建优化器](#使用-optim_wrapper-构建优化器) + - [使用 PyTorch 支持的优化器](#使用-pytorch-支持的优化器) + - [参数化精细配置](#参数化精细配置) + - [梯度裁剪](#梯度裁剪) + - [梯度累积](#梯度累积) + - [自定义参数策略](#自定义参数策略) + - [自定义学习率策略](#自定义学习率策略) + - [自定义动量策略](#自定义动量策略) + - [添加新的优化器或构造器](#添加新的优化器或构造器) + - [添加新的优化器](#添加新的优化器) + - [1. 实现一个新的优化器](#1-实现一个新的优化器) + - [2. 导入优化器](#2-导入优化器) + - [3. 在配置文件中指定优化器](#3-在配置文件中指定优化器) + - [添加新的优化器构造器](#添加新的优化器构造器) + +## 使用 optim_wrapper 构建优化器 + +我们使用 `optim_wrapper` 字段来配置优化策略,其中包括选择优化器、参数逐个配置、梯度裁剪和梯度累积。一个简单的示例可以是: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +在上面的示例中,我们构建了一个学习率为 0.0003,权重衰减为 0.0001 的 SGD 优化器。 + +### 使用 PyTorch 支持的优化器 + +我们支持 PyTorch 实现的所有优化器。要使用不同的优化器,只需更改配置文件中的 `optimizer` 字段。例如,如果想使用 `torch.optim.Adam`,可以在配置文件中进行如下修改。 + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +首先,我们需要将 `type` 的值更改为 `torch.optim` 支持的期望优化器名称。然后,将该优化器的必要参数添加到 `optimizer` 字段中。上述配置将构建以下优化器: + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### 参数化精细配置 + +一些模型可能对优化有特定的参数设置,例如对于 BatchNorm 层不使用权重衰减,或者对不同网络层使用不同的学习率。为了对其进行细致配置,我们可以使用 `optim_wrapper` 中的 `paramwise_cfg` 参数。 + +- **为不同类型的参数设置不同的超参数倍数。** + + 例如,我们可以在 `paramwise_cfg` 中设置 `norm_decay_mult=0.`,将归一化层的权重衰减设置为零。 + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + 
还支持设置其他类型的参数,包括: + + - `lr_mult`:所有参数的学习率乘数。 + - `decay_mult`:所有参数的权重衰减乘数。 + - `bias_lr_mult`:偏置项的学习率乘数(不包括归一化层的偏置项和可变形卷积层的偏移量)。默认为 1。 + - `bias_decay_mult`:偏置项的权重衰减乘数(不包括归一化层的偏置项和可变形卷积层的偏移量)。默认为 1。 + - `norm_decay_mult`:归一化层权重和偏置项的权重衰减乘数。默认为 1。 + - `dwconv_decay_mult`:深度卷积层的权重衰减乘数。默认为 1。 + - `bypass_duplicate`:是否跳过重复的参数。默认为 `False`。 + - `dcn_offset_lr_mult`:可变形卷积层的学习率乘数。默认为 1。 + +- **为特定参数设置不同的超参数倍数。** + + MMAction2 可以使用 `paramwise_cfg` 中的 `custom_keys` 来指定不同的参数使用不同的学习率或权重衰减。 + + 例如,要将 `backbone.layer0` 的所有学习率和权重衰减设置为 0,而保持 `backbone` 的其余部分与优化器相同,并将 `head` 的学习率设置为 0.001,可以使用以下配置: + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### 梯度裁剪 + +在训练过程中,损失函数可能接近悬崖区域,导致梯度爆炸。梯度裁剪有助于稳定训练过程。梯度裁剪的更多介绍可以在[这个页面](https://paperswithcode.com/method/gradient-clipping)找到。 + +目前,我们支持 `optim_wrapper` 中的 `clip_grad` 选项进行梯度裁剪,参考[PyTorch 文档](torch.nn.utils.clip_grad_norm_)。 + +以下是一个示例: + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: 使用的 p-范数的类型,这里 norm_type 为 2。 + clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### 梯度累积 + +当计算资源有限时,批量大小只能设置为较小的值,这可能会影响模型的性能。可以使用梯度累积来解决这个问题。我们支持 `optim_wrapper` 中的 `accumulative_counts` 选项进行梯度累积。 + +以下是一个示例: + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +表示在训练过程中,每 4 个迭代执行一次反向传播。上述示例等价于: + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## 新增优化器或者优化器构造器 + +在训练中,优化参数(如学习率、动量等)通常不是固定的,而是随着迭代或周期的变化而变化。PyTorch 支持几种学习率策略,但对于复杂的策略可能不足够。在 MMAction2 中,我们提供 `param_scheduler` 来更好地控制不同参数的学习率策略。 + +### 配置学习率调整策略 + +调整学习率策略被广泛用于提高性能。我们支持大多数 PyTorch 学习率策略,包括 
`ExponentialLR`、`LinearLR`、`StepLR`、`MultiStepLR` 等。 + +所有可用的学习率策略可以在[这里](https://mmaction2.readthedocs.io/en/latest/schedulers.html)找到,学习率策略的名称以 `LR` 结尾。 + +- **单一学习率策略** + + 在大多数情况下,我们只使用一个学习策略以简化问题。例如,`MultiStepLR` 被用作 ResNet 的默认学习率策略。在这里,`param_scheduler` 是一个字典。 + + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + 或者,我们想使用 `CosineAnnealingLR` 策略来衰减学习率: + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **多个学习率策略** + + 在某些训练案例中,为了提高准确性,会应用多个学习率策略。例如,在早期阶段,训练容易不稳定,预热是一种减少不稳定性的技术。学习率将从一个较小的值逐渐增加到预期值,通过预热进行衰减和其他策略进行衰减。 + + 在 MMAction2 中,通过将所需的策略组合成 `param_scheduler` 的列表即可实现预热策略。 + + 以下是一些示例: + + 1. 在前 50 个迭代中进行线性预热。 + + ```python + param_scheduler = [ + # 线性预热 + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # 按迭代 + end=50), # 仅在前 50 个迭代中进行预热 + # 主要的学习率策略 + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. 
在前 10 个周期中进行线性预热,并在每个周期内按迭代更新学习率。 + + ```python + param_scheduler = [ + # 线性预热 [0, 10) 个周期 + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # 按迭代更新学习率 + ), + # 在 10 个周期后使用 CosineAnnealing 策略 + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + 注意,我们在这里使用 `begin` 和 `end` 参数来指定有效范围,该范围为 \[`begin`, `end`)。范围的单位由 `by_epoch` 参数定义。如果未指定,则 `begin` 为 0,`end` 为最大周期或迭代次数。 + + 如果所有策略的范围都不连续,则学习率将在忽略的范围内保持不变,否则所有有效的策略将按特定阶段的顺序执行,这与 PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler) 的行为相同。 + +### 自定义动量策略 + +我们支持使用动量策略根据学习率修改优化器的动量,这可以使损失以更快的方式收敛。使用方法与学习率策略相同。 + +所有可用的学习率策略可以在[这里](https://mmaction2.readthedocs.io/en/latest/schedulers.html)找到,动量策略的名称以 `Momentum` 结尾。 + +以下是一个示例: + +```python +param_scheduler = [ + # 学习率策略 + dict(type='LinearLR', ...), + # 动量策略 + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## 添加新的优化器或构造器 + +本部分将修改 MMAction2 源代码或向 MMAction2 框架中添加代码,初学者可以跳过此部分。 + +### 添加新的优化器 + +在学术研究和工业实践中,可能需要使用 MMAction2 未实现的优化方法,可以通过以下方法进行添加。 + +#### 1. 实现一个新的优化器 + +假设要添加一个名为 `MyOptimizer` 的优化器,它具有参数 `a`、`b` 和 `c`。需要在 `mmaction/engine/optimizers` 下创建一个新文件,并在文件中实现新的优化器,例如在 `mmaction/engine/optimizers/my_optimizer.py` 中: + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. 导入优化器 + +为了找到上述定义的模块,需要在运行时导入该模块。首先,在 `mmaction/engine/optimizers/__init__.py` 中导入该模块,将其添加到 `mmaction.engine` 包中。 + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer 可能是其他类名 + +__all__ = [..., 'MyOptimizer'] +``` + +在运行时,我们将自动导入 `mmaction.engine` 包,并同时注册 `MyOptimizer`。 + +#### 3. 
在配置文件中指定优化器 + +然后,可以在配置文件的 `optim_wrapper.optimizer` 字段中使用 `MyOptimizer`。 + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### 添加新的优化器构造器 + +一些模型可能对优化有一些特定的参数设置,例如所有 `BatchNorm` 层的不同权重衰减率。 + +尽管我们已经可以使用[优化器教程](#参数化精细配置)中的 `optim_wrapper.paramwise_cfg` 字段来配置各种特定参数的优化器设置,但可能仍无法满足需求。 + +当然,你可以修改它。默认情况下,我们使用 [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) 类来处理优化器的构造。在构造过程中,它根据 `paramwise_cfg` 对不同参数的优化器设置进行细致配置,这也可以作为新优化器构造器的模板。 + +你可以通过添加新的优化器构造器来覆盖这些行为。 + +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +然后,导入它并几乎像[优化器教程](#添加新的优化器)中那样使用它。 + +1. 在 `mmaction/engine/optimizers/__init__.py` 中导入它,将其添加到 `mmaction.engine` 包中。 + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. 
在配置文件的 `optim_wrapper.constructor` 字段中使用 `MyOptimWrapperConstructor`。 + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/zh_cn/advanced_guides/customize_pipeline.md b/docs/zh_cn/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000000000000000000000000000000000..f9fe232677775fa19a92512f318fcfe8d9ff903a --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_pipeline.md @@ -0,0 +1,144 @@ +# 自定义数据流水线 + +在本教程中,我们将介绍如何为你的任务构建数据流水线(即,数据转换)的一些方法。 + +- [自定义数据流水线](#自定义数据流水线) + - [数据流水线设计](#数据流水线设计) + - [修改训练/测试数据流水线](#修改训练/测试数据流水线) + - [加载](#加载) + - [采样帧和其他处理](#采样帧和其他处理) + - [格式化](#格式化) + - [添加新的数据转换](#添加新的数据转换) + +## 数据流水线设计 + +数据流水线指的是从数据集索引样本时处理数据样本字典的过程,它包括一系列的数据转换。每个数据转换接受一个 `dict` 作为输入,对其进行处理,并产生一个 `dict` 作为输出,供序列中的后续数据转换使用。 + +以下是一个例子,用于使用 `VideoDataset` 在 Kinetics 上训练 SlowFast 的数据流水线。这个数据流水线首先使用 [`decord`](https://github.com/dmlc/decord) 读取原始视频并随机采样一个视频剪辑,该剪辑包含 `32` 帧,帧间隔为 `2`。然后,它对所有帧应用随机大小调整的裁剪和随机水平翻转,然后将数据形状格式化为 `NCTHW`,在这个例子中,它是 `(1, 3, 32, 224, 224)`。 + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +MMAction2 中所有可用的数据转换的详细列表可以在 [mmaction.datasets.transforms](mmaction.datasets.transforms) 中找到。 + +## 修改训练/测试数据流水线 + +MMAction2 的数据流水线非常灵活,因为几乎每一步的数据预处理都可以从配置文件中进行配置。然而,对于一些用户来说,这种多样性可能会让人感到不知所措。 + +以下是一些用于构建动作识别任务数据流水线的一般实践和指南。 + +### 加载 + +在数据流水线的开始,通常是加载视频。然而,如果帧已经被提取出来,你应该使用 `RawFrameDecode` 并修改数据集类型为 `RawframeDataset`。 + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + 
dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +如果你需要从具有不同格式(例如,`pkl`,`bin`等)的文件或从特定位置加载数据,你可以创建一个新的加载转换并将其包含在数据流水线的开始。有关更多详细信息,请参阅[添加新的数据转换](#添加新的数据转换)。 + +### 采样帧和其他处理 + +在训练和测试过程中,我们可能会有从视频中采样帧的不同策略。 + +例如,当测试 SlowFast 时,我们会均匀地采样多个剪辑,如下所示: + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +在上述例子中,每个视频将均匀地采样10个视频剪辑,每个剪辑包含32帧。 `test_mode=True` 用于实现这一点,与训练期间的随机采样相反。 + +另一个例子涉及 `TSN/TSM` 模型,它们从视频中采样多个片段: + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +通常,数据流水线中的数据增强只处理视频级的转换,例如调整大小或裁剪,而不处理像视频标准化或 mixup/cutmix 这样的转换。这是因为我们可以在批量视频数据上进行视频标准化和 mixup/cutmix,以使用 GPU 加速处理。要配置视频标准化和 mixup/cutmix,请使用 [mmaction.models.utils.data_preprocessor](mmaction.models.utils.data_preprocessor)。 + +### 格式化 + +格式化涉及从数据信息字典中收集训练数据,并将其转换为与模型兼容的格式。 + +在大多数情况下,你可以简单地使用 [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs),它将以 `NumPy Array` 格式的图像转换为 `PyTorch Tensor`,并将地面真实类别信息和其他元信息打包为一个类似字典的对象 [`ActionDataSample`](mmaction.structures.ActionDataSample)。 + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## 添加新的数据转换 + +1. 
要创建一个新的数据转换,编写一个新的转换类在一个 Python 文件中,例如,名为 `my_transforms.py`。数据转换类必须继承 [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) 类,并重写 `transform` 方法,该方法接受一个 `dict` 作为输入并返回一个 `dict`。最后,将 `my_transforms.py` 放在 `mmaction/datasets/transforms/` 文件夹中。 + + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + def __init__(self, msg): + self.msg = msg + + def transform(self, results): + # 修改数据信息字典 `results`。 + print(msg, 'MMAction2.') + return results + ``` + +2. 在 `mmaction/datasets/transforms/__init__.py` 中导入新类。 + + ```python + ... + from .my_transform import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. 在配置文件中使用它。 + + ```python + train_pipeline = [ + ... + dict(type='MyTransform', msg='Hello!'), + ... + ] + ``` diff --git a/docs/zh_cn/advanced_guides/dataflow.md b/docs/zh_cn/advanced_guides/dataflow.md new file mode 100644 index 0000000000000000000000000000000000000000..c3c7273aff2ce3a3c1c8eac668b9ae2a292ca55e --- /dev/null +++ b/docs/zh_cn/advanced_guides/dataflow.md @@ -0,0 +1,3 @@ +# MMAction2 的数据流 + +内容建设中... diff --git a/docs/zh_cn/advanced_guides/depoly.md b/docs/zh_cn/advanced_guides/depoly.md new file mode 100644 index 0000000000000000000000000000000000000000..82fab764a856d26c5575a22f24743411b4e54a5f --- /dev/null +++ b/docs/zh_cn/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3f688462bc92067c883eb4c61bc9246c271f659 --- /dev/null +++ b/docs/zh_cn/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. automodule:: mmaction.apis + :members: + +mmaction.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. 
automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. 
def get_version():
    """Return the ``__version__`` string defined in the version module.

    The version file is executed in an isolated namespace and the
    ``__version__`` name it defines is returned, so the docs build does
    not need to import the (possibly uninstalled) ``mmaction`` package.
    """
    namespace = {}
    with open(version_file) as f:
        code = compile(f.read(), version_file, 'exec')
    exec(code, namespace)
    return namespace['__version__']
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_copybutton', + 'sphinx_tabs.tabs', + 'notfound.extension', + 'sphinxcontrib.jquery', +] + +# numpy and torch are required +autodoc_mock_imports = ['mmaction.version', 'PIL'] + +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'} + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+ +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme_options = { + # 'logo_url': 'https://mmaction2.readthedocs.io/en/latest/', + 'menu': [ + { + 'name': + 'Tutorial', + 'url': + 'https://colab.research.google.com/github/' + 'open-mmlab/mmaction2/blob/master/demo/mmaction2_tutorial.ipynb' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmaction2' + }, + { + 'name': + 'Upstream', + 'children': [{ + 'name': + 'MMCV', + 'url': + 'https://github.com/open-mmlab/mmcv', + 'description': + 'Foundational library for computer vision' + }, { + 'name': + 'MMPreTrain', + 'url': + 'https://github.com/open-mmlab/mmpretrain', + 'description': + 'Open source pre-training toolbox based on PyTorch' + }, { + 'name': + 'MMDetection', + 'url': + 'https://github.com/open-mmlab/mmdetection', + 'description': + 'Object detection toolbox and benchmark' + }, { + 'name': + 'MMPose', + 'url': + 'https://github.com/open-mmlab/mmpose', + 'description': + 'Open-source toolbox for pose estimation based on PyTorch.' 
+ }] + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +language = 'en' +master_doc = 'index' + +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +# The not found page +notfound_template = '404.html' + + +def builder_inited_handler(app): + if subprocess.run(['python', './stat.py']).returncode != 0: + raise RuntimeError('Failed to run the script `stat.py`.') + if subprocess.run(['python', './project_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `project_zoo.py`.') + if subprocess.run(['python', './dataset_zoo.py']).returncode != 0: + raise RuntimeError('Failed to run the script `dataset_zoo.py`.') + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..3a637cb21ef8eb3775745153994537356c732e1f --- /dev/null +++ b/docs/zh_cn/dataset_zoo.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import re +from pathlib import Path + +from utils import replace_link + +DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages. 
def generate_datasets_pages():
    """Build the dataset-zoo pages from the READMEs under ``tools/data``.

    For every dataset, the README (preferring the Chinese translation when
    one exists) is copied into ``DATASETS_ROOT`` with its relative markdown
    links rewritten, and a summary page ``datasetzoo_statistics.md`` listing
    all supported datasets is generated from ``DATASETZOO_TEMPLATE``.

    All files are read and written as UTF-8 explicitly: the READMEs contain
    Chinese text, and relying on the locale's default encoding would raise
    ``UnicodeDecodeError`` (or corrupt output) on non-UTF-8 systems.
    """
    dataset_list = Path('../../tools/data').glob('*/README.md')
    num_datasets = 0
    dataset_msgs = []

    for file in dataset_list:
        num_datasets += 1

        # Destination page, e.g. dataset_zoo/kinetics.md
        copy = DATASETS_ROOT / file.parent.with_suffix('.md').name

        title_template = r'^# Preparing (.*)'
        # Prefer the Chinese README when one is available.
        chinese_readme = Path(
            str(file).replace('README.md', 'README_zh-CN.md'))
        if chinese_readme.exists():
            file = chinese_readme
            title_template = r'^# 准备(.*)'
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()

        title = re.match(title_template, content).group(1)
        title = title.lstrip(' ')
        # Rewrite inline and reference-style markdown links so they remain
        # valid from the copied page's location.
        content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
                               file)
        content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
                               file)
        dataset_msgs.append(f'\t - [{title}]({copy})')

        with open(copy, 'w', encoding='utf-8') as f:
            f.write(content)

    dataset_msg = '\n'.join(dataset_msgs)

    modelzoo = DATASETZOO_TEMPLATE.format(
        num_datasets=num_datasets,
        dataset_msg=dataset_msg,
    )

    with open('datasetzoo_statistics.md', 'w', encoding='utf-8') as f:
        f.write(modelzoo)
添加文档或将文档翻译成其他语言 +- 添加关于视频理解算法的新项目(推荐),具体细节请参考[这里](../projectzoo.md) + +## 工作流程 + +1. Fork 并拉取最新的 mmaction2 +2. 创建一个有意义的新分支(不要使用主分支进行 PR) +3. 提交你的更改 +4. 创建一个 PR + +```{note} +- 如果你计划添加一些涉及大规模更改的新功能,请首先打开一个 issue 进行讨论。 +- 如果你是论文的作者,并希望将你的方法包含在 mmaction2 中,请与我们联系。我们将非常感谢您的贡献。 +``` + +## 代码风格 + +### Python + +我们采用 [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为首选代码风格。 + +我们使用以下工具进行代码检查和格式化: + +- [flake8](http://flake8.pycqa.org/en/latest/):检查器 +- [yapf](https://github.com/google/yapf):格式化器 +- [isort](https://github.com/timothycrosley/isort):排序导入 +- [codespell](https://github.com/codespell-project/codespell):一个用于修复文本文件中常见拼写错误的 Python 工具。 +- [mdformat](https://github.com/executablebooks/mdformat):Mdformat 是一个自由裁量的 Markdown 格式化工具,可用于强制执行一致的 Markdown 文件样式。 +- [docformatter](https://github.com/myint/docformatter):一个格式化工具,用于格式化文档字符串。 + +yapf 和 isort 的样式配置可以在 [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/main/setup.cfg) 中找到。 + +我们使用 [pre-commit hook](https://pre-commit.com/) 来保证每次提交时自动进行代码检查和格式化,启用的功能包括 `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, 修复 `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, 对 `requirments.txt`的排序等。 +预提交钩子的配置存储在 [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/main/.pre-commit-config.yaml) 中。 + +在克隆仓库后,你需要安装初始化的预提交钩子。 + +```shell +pip install -U pre-commit +``` + +从仓库文件夹中 + +```shell +pre-commit install +``` + +在此之后,每次提交,代码规范检查和格式化工具都将被强制执行。 + +```{note} +在创建 PR 之前,请确保你的代码通过了 lint 检查并由 yapf 进行了格式化。 +``` + +### C++ 和 CUDA + +我们遵循 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)。 diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..ba2cce86675933f3852bdba1a154d8f7a8e9ebbd --- /dev/null +++ b/docs/zh_cn/get_started/faq.md @@ -0,0 +1,125 @@ +# 常见问题解答 + +## 概述 + +我们在这里列出了许多用户常遇到的问题以及相应的解决方案。 + +- [常见问题解答](#常见问题解答) + - [概述](#概述) + - [安装](#安装) + - 
[数据](#数据) + - [训练](#训练) + - [测试](#测试) + +如果您发现任何频繁出现的问题并且有解决方法,欢迎在列表中补充。如果这里的内容没有涵盖您的问题,请使用[提供的模板](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md)创建一个问题,并确保在模板中填写所有必要的信息。 + +## 安装 + +- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"** + + 1. 使用 `pip uninstall mmcv` 命令卸载环境中的现有 mmcv。 + 2. 参照[安装说明](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html#install-mmcv)安装 mmcv。 + +- **"OSError: MoviePy Error: creation of None failed because of the following error"** + + 使用 `pip install moviepy` 安装。更多信息可以参考[官方安装文档](https://zulko.github.io/moviepy/install.html), 请注意(根据这个 [issue](https://github.com/Zulko/moviepy/issues/693)): + + 1. 对于 Windows 用户,[ImageMagick](https://www.imagemagick.org/script/index.php) 不会自动被 MoviePy 检测到,需要修改 `moviepy/config_defaults.py` 文件,提供 ImageMagick 二进制文件 `magick` 的路径,例如 `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"` + 2. 对于 Linux 用户,如果 MoviePy 没有检测到 ImageMagick,需要修改 `/etc/ImageMagick-6/policy.xml` 文件,将 `` 注释掉,改为 ``。 + +- **"即使我已经安装了 XXCODEBASE,为什么还会收到 'Please install XXCODEBASE to use XXX' 的错误消息?"** + + 您收到该错误消息是因为我们的项目无法从 XXCODEBASE 中导入一个函数或类。您可以尝试运行相应的代码行来查看发生了什么。一个可能的原因是,在 OpenMMLAB 的某些代码库中,您需要在安装它们之前先安装 mmcv 和 mmengine。您可以按照[教程](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html#installation)来安装它们。 + +## 数据 + +- **FileNotFound 错误,例如 `No such file or directory: xxx/xxx/img_00300.jpg`** + + 在我们的仓库中,我们将 `start_index=1` 设置为 rawframe 数据集的默认值,将 `start_index=0` 设置为视频数据集的默认值。如果用户遇到数据的第一帧或最后一帧的 FileNotFound 错误,需要检查以 0 或 1 作为偏移量开始的文件,例如 `xxx_00000.jpg` 或 `xxx_00001.jpg`,然后在配置文件中更改数据处理流水线的 `start_index` 值。 + +- **我们应该如何预处理数据集中的视频?将它们调整为固定大小(所有视频的高宽比相同),例如 `340x256`(1),还是调整它们使得所有视频的短边具有相同的长度(256px 或 320px)(2)?** + + 我们尝试过这两种预处理方法,并发现(2)通常是更好的解决方案,因此我们使用(2)作为默认的预处理设置,短边长度为 256px。我们对这些预处理方法进行了基准测试,您可以在[TSN 数据基准测试](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn)和[SlowOnly 
数据基准测试](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly)中找到结果。 + +- **数据处理流水线中的项不匹配导致出现类似 `KeyError: 'total_frames'` 的错误** + + 我们有用于处理视频和帧的两个处理流水线。 + + **对于视频**,我们应该在处理流水线中动态解码视频,所以在这种情况下应该使用 `DecordInit & DecordDecode`、`OpenCVInit & OpenCVDecode` 或 `PyAVInit & PyAVDecode` 这样的配对,例如[这个示例](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py#L14-L16)。 + + **对于帧**,图像已经在离线状态下解码,所以在这种情况下应该使用 `RawFrameDecode` 这样的处理流水线项,例如[这个示例](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py#L17)。 + + `KeyError: 'total_frames'` 是由于错误地将 `RawFrameDecode` 步骤用于视频,因为当输入是视频时,无法预先获取 `total_frames`。 + +## 训练 + +- **如何只使用训练好的识别模型进行主干网络的预训练?** + + 为了使用预训练模型进行整个网络的训练,新的配置文件在 `load_from` 中添加了预训练模型的链接。 + + 要使用主干进行预训练,可以将配置文件中主干部分的 `pretrained` 值更改为权重路径/URL。在训练时,未预料到的键将被忽略。 + +- **在微调模型时如何固定主干的某些阶段?** + + 您可以参考 [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L791) 和 [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L369-L370)。 + 提醒在配置文件中设置 `find_unused_parameters = True`,以进行分布式训练或测试。 + + 实际上,除了少数模型,如 C3D 等,用户可以设置 `frozen_stages` 来冻结主干的阶段,因为几乎所有继承自 `ResNet` 和 `ResNet3D` 的主干都支持内部函数 `_freeze_stages()`。 + +- **如何在配置文件中设置 memcached ?** + + 在 MMAction2 中,您可以将 memcached 的参数传递给用于视频数据集的 `class DecordInit` 或用于原始帧数据集的 `RawFrameDecode`。有关更多细节,请参阅 MMEngine 中的 [`class FileClient`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/data/pipelines/file_client.py)。以下是一个示例,演示如何在原始帧数据集中使用 memcached: + + ```python + mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path') + + train_pipeline = [ + ... + dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg), + ... 
+ ] + ``` + +- **如何在配置文件中设置 `load_from` 的值以微调模型?** + + 在 MMAction2 中,我们将 `load_from=None` 设置为 `configs/_base_/default_runtime.py` 中的默认值,并且由于[继承设计](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md),用户可以直接通过在其配置文件中设置 `load_from` 来更改它。 + +- **如何在训练时使用 `RawFrameDataset`?** + + 在 MMAction2 1.x 版本中,大多数配置文件默认使用 `VideoDataset` 作为数据集类型,这对于文件存储更加友好。如果您想使用 `RawFrameDataset`,需要进行两个修改步骤: + + - `dataset` 相关: + 将 `train_dataloader`/`val_dataloader`/`test_dataloader` 中的 `dataset` 从 + + ``` + dataset=dict( + type=VideoDataset, + data_prefix=dict(video=xxx), + ...) + ``` + + 修改为 + + ``` + dataset=dict( + type=RawFrameDataset, + data_prefix=dict(img=xxx), + filename_tmpl='{:05}.jpg', + ...) + ``` + + 数据集的其他字段不需要修改。请确保 `filename_tmpl` 与帧数据匹配,并参考[配置文件文档](../user_guides/config.md)了解更多关于配置文件的详细信息。 + + - `transform` 相关:在 `train_pipeline`/`val_pipeline`/`test_pipeline` 中删除 `dict(type='DecordInit', **file_client_args)`,将 `dict(type='DecordDecode')` 修改为 `dict(type='RawFrameDecode', **file_client_args)`,并确保在配置文件中定义了 `file_client_args = dict(io_backend='disk')`。 + + 有关自定义数据集的更多修改,请参考[准备数据集](../user_guides/prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)。 + +## 测试 + +- **如何使预测得分在 softmax 内归一化到 \[0, 1\] ?** + + 在配置文件中将 `model.cls_head.average_clips` 设置为 `'prob'`。 + +- **如果模型过大,GPU 内存无法容纳甚至只有一个测试样本怎么办?** + + 默认情况下,3D 模型使用 10 个 clips x 3 个 crops 进行测试,总共有 30 个视图。对于非常大的模型,即使只有一个测试样本,GPU 内存也无法容纳(因为有 30 个视图)。为了解决这个问题,您可以在配置文件的 `model['test_cfg']` 中设置 `max_testing_views=n`。这样,在前向传播过程中,会使用 n 个视图作为一个批次,以节省 GPU 内存的使用。 diff --git a/docs/zh_cn/get_started/guide_to_framework.md b/docs/zh_cn/get_started/guide_to_framework.md new file mode 100644 index 0000000000000000000000000000000000000000..63264365f17a66f41c2862d06fe370acd4f08dbc --- /dev/null +++ b/docs/zh_cn/get_started/guide_to_framework.md @@ -0,0 +1,761 @@ +# 20分钟了解 MMAction2 框架设计 + +在本教程中,我们将通过一个视频动作识别的手把手教程来演示 `MMACTION2 1.0` 的整体架构。 + +本教程的目录如下: + +- [20分钟了解 MMAction2 框架设计](#20分钟了解-mmaction2-框架设计) 
+ - [步骤0:准备数据](#步骤0准备数据) + - [步骤1:构建一个数据流水线](#步骤1构建一个数据流水线) + - [步骤2:构建一个数据集和数据加载器](#步骤2构建一个数据集和数据加载器) + - [步骤3:构建一个识别器](#步骤3构建一个识别器) + - [步骤4:构建一个评估指标](#步骤4构建一个评估指标) + - [步骤5:使用本地 PyTorch 训练和测试](#步骤5使用本地-pytorch-训练和测试) + - [步骤6:使用 MMEngine 训练和测试(推荐)](#步骤6使用-mmengine-训练和测试推荐) + +首先,我们需要初始化注册表的 `scope` ,以确保每个模块都在 `mmaction` 范围下注册。有关注册表的更多详细信息,请参考[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html) 。 + +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## 步骤0:准备数据 + +请下载我们准备的[精简版 kinetics400](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) 数据集,并将其提取到 `$MMACTION2/data` 目录。 + +解压后的目录结构应如下所示: + +``` +mmaction2 +├── data +│ ├── kinetics400_tiny +│ │ ├── kinetics_tiny_train_video.txt +│ │ ├── kinetics_tiny_val_video.txt +│ │ ├── train +│ │ │ ├── 27_CSXByd3s.mp4 +│ │ │ ├── 34XczvTaRiI.mp4 +│ │ │ ├── A-wiliK50Zw.mp4 +│ │ │ ├── ... +│ │ └── val +│ │ ├── 0pVGiAU6XEA.mp4 +│ │ ├── AQrbRSnRt8M.mp4 +│ │ ├── ... 
+``` + +以下是标注文件 `kinetics_tiny_train_video.txt` 中的一些示例: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +文件中的每一行表示每一个视频的标注,其中第一项表示视频文件名(如 `D32_1gwq35E.mp4` ),第二项表示相应的标签(如 `D32_1gwq35E.mp4` 的标签是 `0` )。在这个数据集中,只有 `两个` 类别。 + +## 步骤1:构建一个数据流水线 + +为了实现 `解码`、`采样`、`调整大小`、`裁剪`、`格式化` 和 `打包` 视频数据和相应的标签,我们需要设计一个数据流水线来处理这些过程。具体来说,我们设计了7个 `Transform` 类来构建这个视频处理流水线。注意,OpenMMLab 中的所有`Transform` 类都必须继承自 `mmcv` 中的 `BaseTransform` 类,实现抽象方法 `transform`,并注册到 `TRANSFORMS` 注册表。有关数据转换的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html) 。 + +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # 使测试期间的采样具有确定性 + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = 
results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample().set_gt_label(results['label']) + metainfo = 
{k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + packed_results['data_samples'] = data_sample + return packed_results +``` + +下面,我们提供了一个代码片段(使用标注文件中的 `D32_1gwq35E.mp4 0` )来演示如何使用数据流水线。 + +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# 获取输入的信息 +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# 获取输入的标签 +print('label: ', data_sample.gt_label) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## 步骤2:构建一个数据集和数据加载器 + +OpenMMLab中的所有 `Dataset` 类都必须继承自 `mmengine` 中的 `BaseDataset` 类。我们可以通过覆盖 `load_data_list` 方法来定制注释加载过程。此外,我们可以通过覆盖 `get_data_info` 方法,向 `results` 字典添加更多字段,它将作为输入传给 `pipeline` 。有关 `BaseDataset` 类的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) 。 + +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + 
super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +接下来,我们将演示如何使用 dataset 和 dataloader 来索引数据。我们将使用 `Runner.build_dataloader` 方法来构造 dataloader。有关 dataloader 的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader) 。 + +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] 
+data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# 获取输入的信息 +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# 获取输入的标签 +print('label: ', data_sample.gt_label) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +终端输出应该与[步骤1:构建一个数据流水线](#步骤1:构建一个数据流水线)中的输出相同。 + +## 步骤3:构建一个识别器 + +接下来,我们将构建 `recognizer`,它主要由三部分组成:用于批处理和规范化数据的 `data preprocessor`,用于特征提取的 `backbone` 和用于分类的 `cls_head` 。 + +`data_preprocessor` 的实现如下: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # 批处理 + batch_inputs = (batch_inputs - 
self.mean) / self.std # 归一化 + data['inputs'] = batch_inputs + return data +``` + +以下是 data_preprocessor 的用法:将从[步骤2:构建一个数据集和数据加载器](#步骤2:构建一个数据集和数据加载器)中获得的 `batched_packed_results` 提供给 `data_preprocessor` 进行批处理和归一化。 + +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +`backbone`、`cls_head` 和 `recognizer` 的实现如下: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', 
layer='Linear', std=0.01)
+
+        super(ClsHeadZelda, self).__init__(init_cfg=init_cfg)
+
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.average_clips = average_clips
+
+        if dropout != 0:
+            self.dropout = nn.Dropout(dropout)
+        else:
+            self.dropout = None
+
+        self.fc = nn.Linear(self.in_channels, self.num_classes)
+        self.pool = nn.AdaptiveAvgPool3d(1)
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, x):
+        N, C, T, H, W = x.shape
+        x = self.pool(x)
+        x = x.view(N, C)
+        assert x.shape[1] == self.in_channels
+
+        if self.dropout is not None:
+            x = self.dropout(x)
+
+        cls_scores = self.fc(x)
+        return cls_scores
+
+    def loss(self, feats, data_samples):
+        cls_scores = self(feats)
+        labels = torch.stack([x.gt_label for x in data_samples])
+        labels = labels.squeeze()
+
+        if labels.shape == torch.Size([]):
+            labels = labels.unsqueeze(0)
+
+        loss_cls = self.loss_fn(cls_scores, labels)
+        return dict(loss_cls=loss_cls)
+
+    def predict(self, feats, data_samples):
+        cls_scores = self(feats)
+        num_views = cls_scores.shape[0] // len(data_samples)
+        # assert num_views == data_samples[0].num_clips
+        cls_scores = self.average_clip(cls_scores, num_views)
+
+        for ds, sc in zip(data_samples, cls_scores):
+            ds.pred_score = sc
+        return data_samples
+
+    def average_clip(self, cls_scores, num_views):
+        if self.average_clips not in ['score', 'prob', None]:
+            raise ValueError(f'{self.average_clips} is not supported. 
' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +`init_cfg` 用于模型权重初始化。有关模型权重初始化的更多信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html) 。上述模块的用法如下: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + 
+# 训练 +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# 验证 +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_label) +print('Scores of Sample[0]', predictions[0].pred_score) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: 
mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## 步骤4:构建一个评估指标 + +请注意,`OpenMMLab` 中的所有 `Metric` 类都必须继承自 `mmengine` 中的 `BaseMetric` 类,并实现抽象方法 `process` 和`compute_metrics`。有关评估的更多信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) 。 + +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_score'].cpu().numpy() + label = data_sample['gt_label'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## 步骤5:使用本地 PyTorch 训练和测试 + +```python 
+import torch.optim as optim
+from mmengine import track_iter_progress
+
+
+device = 'cuda' # or 'cpu'
+max_epochs = 10
+
+optimizer = optim.Adam(model.parameters(), lr=0.01)
+
+for epoch in range(max_epochs):
+    model.train()
+    losses = []
+    for data_batch in track_iter_progress(train_data_loader):
+        data = model.data_preprocessor(data_batch, training=True)
+        loss_dict = model(**data, mode='loss')
+        loss = loss_dict['loss_cls']
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        losses.append(loss.item())
+
+    print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader))
+
+    with torch.no_grad():
+        model.eval()
+        for data_batch in track_iter_progress(val_data_loader):
+            data = model.data_preprocessor(data_batch, training=False)
+            predictions = model(**data, mode='predict')
+            data_samples = [d.to_dict() for d in predictions]
+            metric.process(data_batch, data_samples)
+
+        acc = metric.compute_metrics(metric.results)
+        for name, topk in acc.items():
+            print(f'{name}: ', topk)
+```
+
+## 步骤6:使用 MMEngine 训练和测试(推荐)
+
+关于训练和测试的更多细节,你可以参考[ MMAction2 教程](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html) 。有关 `Runner` 的更多信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html) 。
+
+```python
+from mmengine.runner import Runner
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1)
+val_cfg = dict(type='ValLoop')
+
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01))
+
+runner = Runner(model=model_cfg, work_dir='./work_dirs/guide',
+                train_dataloader=train_dataloader_cfg,
+                train_cfg=train_cfg,
+                val_dataloader=val_dataloader_cfg,
+                val_cfg=val_cfg,
+                optim_wrapper=optim_wrapper,
+                val_evaluator=[metric_cfg],
+                default_scope='mmaction')
+runner.train()
+```
diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md
new file mode 100644
index 0000000000000000000000000000000000000000..91d95181fffd201beed2dbdd20d84942a71ee069
--- 
/dev/null +++ b/docs/zh_cn/get_started/installation.md @@ -0,0 +1,198 @@ +# 安装 + +## 前置条件 + +在本节中,我们将演示如何准备 PyTorch 相关的依赖环境。 + +MMAction2 适用于 Linux、Windows 和 MacOS。它需要 Python 3.7+,CUDA 10.2+ 和 PyTorch 1.8+。 + +```{note} +如果您熟悉 PyTorch 并且已经安装了它,可以跳过这部分内容,直接转到[下一节](#installation)。否则,您可以按照以下步骤进行准备工作。 +``` + +**第一步。** 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda。 + +**第二步。** 创建一个 conda 环境并激活它。 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**第三步。** 安装 PyTorch,按照[官方说明](https://pytorch.org/get-started/locally/)进行操作,例如: + +在 GPU 平台上: + +```shell +conda install pytorch torchvision -c pytorch +``` + +```{warning} +此命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请确保它们与您的环境匹配。 +``` + +在 CPU 平台上: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## 最佳实践 + +我们建议用户遵循我们的最佳实践来安装 MMAction2。然而,整个过程是高度可定制的。更多信息请参见[自定义安装](#customize-installation)部分。 + +**第一步。** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine)、[MMCV](https://github.com/open-mmlab/mmcv)、[MMDetection](https://github.com/open-mmlab/mmdetection)(可选)和 [MMPose](https://github.com/open-mmlab/mmpose)(可选)。 + +```shell +pip install -U openmim +mim install mmengine +mim install mmcv +mim install mmdet +mim install mmpose +``` + +**第二步。** 安装 MMAction2。 + +根据您的需求,我们支持两种安装模式: + +- [从源代码构建 MMAction2(推荐)](#build-mmaction2-from-source):您想在 MMAction2 框架上开发自己的动作识别任务或新功能。例如,添加新的数据集或新的模型。因此,您可以使用我们提供的所有工具。 +- [安装为 Python 包](#install-as-a-python-package):您只想在项目中调用 MMAction2 的 API 或导入 MMAction2 的模块。 + +### 从源代码构建 MMAction2 + +在这种情况下,从源代码安装 mmaction2: + +```shell +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +pip install -v -e . 
+# "-v" 表示输出更多安装相关的信息 +# "-e" 表示以可编辑形式安装,这样可以在不重新安装的情况下,让本地修改直接生效。 +``` + +可选地,如果您希望为 MMAction2 做出贡献或体验实验功能,请切换到 `dev-1.x` 分支: + +```shell +git checkout dev-1.x +``` + +### 安装为 Python 包 + +只需使用 pip 安装即可。 + +```shell +pip install mmaction2 +``` + +## 验证安装 + +为了验证 MMAction2 是否安装正确,我们提供了一些示例代码来运行推理演示。 + +**第一步。** 下载配置文件和权重文件。 + +```shell +mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest . +``` + +**第二步。** 验证推理演示。 + +选项(a)。如果您是从源代码安装的 mmaction2,可以运行以下命令: + +```shell +# demo.mp4 和 label_map_k400.txt 都来自于 Kinetics-400 +python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \ + tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \ + demo/demo.mp4 tools/data/kinetics/label_map_k400.txt +``` + +您将在终端看到前5个标签及其对应的分数。 + +选项(b)。如果您将 mmaction2 安装为一个 Python 包,可以在 Python 解释器中运行以下代码,这将进行类似的验证: + +```python +from operator import itemgetter +from mmaction.apis import init_recognizer, inference_recognizer + +config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' +video_file = 'demo/demo.mp4' +label_file = 'tools/data/kinetics/label_map_k400.txt' +model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +pred_result = inference_recognizer(model, video_file) + +pred_scores = pred_result.pred_score.tolist() +score_tuples = tuple(zip(range(len(pred_scores)), pred_scores)) +score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True) +top5_label = score_sorted[:5] + +labels = open(label_file).readlines() +labels = [x.strip() for x in labels] +results = [(labels[k[0]], k[1]) for k in top5_label] + +print('The top-5 labels with corresponding scores are:') +for result in results: + print(f'{result[0]}: ', result[1]) +``` + +## 自定义安装 + +### CUDA 版本 + +在安装 PyTorch 时,您可能需要指定 CUDA 的版本。如果您不确定选择哪个版本,请遵循我们的建议: + +- 对于 Ampere 
架构的 NVIDIA GPU,例如 GeForce 30 series 以及 NVIDIA A100,CUDA 11 是必需的。 +- 对于更早的 NVIDIA GPU,CUDA 11 是向前兼容的,但 CUDA 10.2 能够提供更好的兼容性,也更加轻量。 + +请确保 GPU 驱动程序满足最低版本要求。有关更多信息,请参见[此表格](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。 + +```{note} +如果按照我们的最佳实践进行安装,仅安装 CUDA 运行时库就足够了,因为不会在本地编译任何 CUDA 代码。然而,如果您希望从源代码编译 MMCV 或开发其他 CUDA 运算符,您需要从 NVIDIA 的[网站](https://developer.nvidia.com/cuda-downloads)安装完整的 CUDA 工具包,并且其版本应与 PyTorch 的 CUDA 版本匹配,即 `conda install` 命令中指定的 cudatoolkit 的版本。 +``` + +### 不使用 MIM 安装 MMCV + +MMCV 包含 C++ 和 CUDA 扩展,因此它与 PyTorch 的关系比较复杂。MIM 可以自动解决这些依赖关系,使安装变得更加容易。但这不是必须的。 + +如果您希望使用 pip 而不是 MIM 安装 MMCV,请参考[MMCV 安装指南](https://mmcv.readthedocs.io/en/latest/get_started/installation.html)。这需要手动指定基于 PyTorch 版本和其 CUDA 版本的 find-url。 + +例如,以下命令安装了为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。 + +```shell +pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +``` + +### 在 CPU 环境中安装 + +MMAction2 可以仅在 CPU 环境中安装。在 CPU 模式下,你可以完成训练、测试和模型推理等所有操作。 + +在 CPU 模式下,MMCV 的部分功能将不可用,通常是一些 GPU 编译的算子。不过不用担心, MMAction2 中几乎所有的模型都不会依赖这些算子。 + +### 通过 Docker 使用 MMAction2 + +我们提供了一个[Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile)来构建镜像。确保您的[docker 版本](https://docs.docker.com/engine/install/) >=19.03。 + +```shell +# 构建一个基于 PyTorch 1.6.0、CUDA 10.1 和 CUDNN 7 的镜像。 +# 如果您喜欢其他版本,请修改 Dockerfile。 +docker build -f ./docker/Dockerfile --rm -t mmaction2 . +``` + +使用以下命令运行它: + +```shell +# 例如构建PyTorch 1.6.0, CUDA 10.1, CUDNN 7的镜像 +# 如果你喜欢其他版本,只要修改Dockerfile +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 +``` + +## 故障排除 + +1. 
当从旧版本 `0.x` 迁移到新版本 `1.x` 时,您可能会遇到依赖库版本不匹配的问题。下面是在按照上述安装过程执行后,通过 `pip list` 命令显示的每个依赖库的版本。请确保在终端中显示的每个依赖库版本都大于或等于(即 `>=`)下面每个依赖库的版本。 + +```shell +mmaction2 1.0.0 +mmcv 2.0.0 +mmdet 3.0.0 +mmengine 0.7.2 +mmpose 1.0.0 +``` diff --git a/docs/zh_cn/get_started/overview.md b/docs/zh_cn/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..74e72027e69b1157443ce81f3a81cddc1bb16e76 --- /dev/null +++ b/docs/zh_cn/get_started/overview.md @@ -0,0 +1,97 @@ +# 概述 + +## 什么是 MMAction2 + +MMAction2 是一个基于 PyTorch 的开源工具包,支持了大量的视频理解模型,包括**行为识别、基于骨架的行为识别、时空行为检测和时序动作定位**等多个主要方向。它还支持了大多数流行的学术数据集,并提供了许多实用工具帮助用户对数据集和模型进行多方面的探索和调试。它具有以下特点: + +**全流程,多模型**:MMAction2 支持各种视频理解任务,实现了最先进的行为识别、定位、检测模型。 + +**模块化设计**:MMAction2 的模块化设计使用户可以根据需要定义和重用模型中的模块。 + +**实用工具众多**:MMAction2 提供了一系列的分析工具,如可视化器、验证脚本、评估器等,以帮助用户进行故障排除、微调或比较模型。 + +**由 OpenMMLab 强力驱动**:与家族内的其它算法库一样,MMAction2 遵循着 OpenMMLab 严谨的开发准则和接口约定,极大地降低了用户切换各算法库时的学习成本。同时,MMAction2 也可以非常便捷地与家族内其他算法库跨库联动,从而满足用户跨领域研究和落地的需求。 + + + + +
+

行为识别


+

基于骨架的行为识别

+ + + +
+

时空动作检测


+

时空动作检测

+ +## 如何使用文档 + +针对不同类型的用户,我们准备了详细的指南: + +
+ MMAction2 的基础用法 + +- [安装](installation.md) +- [快速运行](quick_run.md) +- [利用现有模型进行推理](../user_guides/inference.md) + +
+ +
+关于在已支持的数据集上进行训练 + +- [了解配置文件](../user_guides/config.md) +- [准备数据集](../user_guides/prepare_dataset.md) +- [训练与测试](../user_guides/train_test.md) + +
+ +
+关于使用过程中的常见问题 + +- [常见问题解答](faq.md) +- [有用的工具](../useful_tools.md) + +
+ +
+关于 MMAction2 的框架设计 + +- [20分钟 MMAction2 框架指南](guide_to_framework.md) +- [MMAction2 中的数据流](../advanced_guides/dataflow.md) + +
+ +
+关于自定义训练的高级用法 + +- [自定义模型](../advanced_guides/customize_models.md) +- [自定义数据集](../advanced_guides/customize_dataset.md) +- [自定义数据管道](../advanced_guides/customize_pipeline.md) +- [自定义优化器](../advanced_guides/customize_optimizer.md) +- [自定义日志记录](../advanced_guides/customize_logging.md) + +
+ +
+关于支持的模型库和数据集 + +- [模型库](../modelzoo_statistics.md) +- [数据集](../datasetzoo_statistics.md) + +
+ +
+关于从 MMAction2 0.x 迁移 + +- [从 MMAction2 0.x 迁移](../migration.md) + +
+ +
+对于希望加入开源社区,向 MMAction2 贡献代码的研究者和开发者 + +- [如何为 MMAction2 做出贡献](contribution_guide.md) + +
diff --git a/docs/zh_cn/get_started/quick_run.md b/docs/zh_cn/get_started/quick_run.md new file mode 100644 index 0000000000000000000000000000000000000000..b7faa7f92f2180ec0bfbfad0579de371bfd51d1f --- /dev/null +++ b/docs/zh_cn/get_started/quick_run.md @@ -0,0 +1,219 @@ +# 快速运行 + +本章将介绍 MMAction2 的基本功能。我们假设你已经[源码安装 MMAction2](installation.md#best-practices)。 + +- [快速运行](#快速运行) + - [推理](#推理) + - [准备数据集](#准备数据集) + - [修改配置](#修改配置) + - [修改数据集](#修改数据集) + - [修改运行配置](#修改运行配置) + - [修改模型配置](#修改模型配置) + - [浏览数据集](#浏览数据集) + - [训练](#训练) + - [测试](#测试) + +## 推理 + +在 MMAction2 的根目录下执行如下命令: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +您应该能够看到弹出的视频窗口,和在控制台中打印的推断结果。 + +
+ +
+
+ +```bash +# 推理结果 +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +如果您在没有 GUI 的服务器上运行 MMAction2,或者通过禁用 X11 转发的 SSH 隧道运行 MMAction2,则可能不会看到弹出窗口。 +``` + +关于 MMAction2 推理接口的详细描述可以在[这里](/demo/README.md#inferencer)找到. + +除了使用我们提供的预训练模型,您还可以在自己的数据集上训练模型。在下一节中,我们将通过在精简版 [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) 数据集上训练 TSN 为例,带您了解 MMAction2 的基本功能。 + +## 准备数据集 + +由于视频数据集格式的多样性不利于数据集的切换,MMAction2 提出了统一的[数据格式](../user_guides/prepare_dataset.md) ,并为常用的视频数据集提供了[数据集准备指南](../user_guides/data_prepare/dataset_prepare.md)。通常,要在 MMAction2 中使用这些数据集,你只需要按照步骤进行准备。 + +```{笔记} +但在这里,效率意味着一切。 +``` + +首先,请下载我们预先准备好的 [kinetics400_tiny.zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) ,并将其解压到 MMAction2 根目录下的 `data/` 目录。这将为您提供必要的视频和注释文件。 + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## 修改配置 + +准备好数据集之后,下一步是修改配置文件,以指定训练集和训练参数的位置。 + +在本例中,我们将使用 resnet50 作为主干网络来训练 TSN。由于 MMAction2 已经有了完整的 Kinetics400 数据集的配置文件 (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`),我们只需要在其基础上进行一些修改。 + +### 修改数据集 + +我们首先需要修改数据集的路径。打开 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ,按如下替换关键字: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### 修改运行配置 + +此外,由于数据集的大小减少,我们建议将训练批大小减少到4个,训练epoch的数量相应减少到10个。此外,我们建议将验证和权值存储间隔缩短为1轮,并修改学习率衰减策略。修改 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 中对应的关键字,如下所示生效。 + +```python +# 设置训练批大小为 4 +train_dataloader['batch_size'] = 4 + +# 每轮都保存权重,并且只保留最新的权重 +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1)) +# 将最大 epoch 数设置为 10,并每 1 个 epoch验证模型 +train_cfg = 
dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +#根据 10 个 epoch调整学习率调度 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] +``` + +### 修改模型配置 + +此外,由于精简版 Kinetics 数据集规模较小,建议加载原始 Kinetics 数据集上的预训练模型。此外,模型需要根据实际类别数进行修改。请直接将以下代码添加到 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 中。 + +```python +model = dict( + cls_head=dict(num_classes=2)) +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' +``` + +在这里,我们直接通过继承 ({external+mmengine:doc} `MMEngine: Config `) 机制重写了基本配置中的相应参数。原始字段分布在 `configs/_base_/models/tsn_r50.py`、`configs/_base_/schedules/sgd_100e.py` 和 `configs/_base_/default_runtime.py`中。 + +```{note} +关于配置的更详细的描述,请参考[这里](../user_guides/config.md)。 +``` + +## 浏览数据集 + +在开始训练之前,我们还可以将训练时数据转换处理的帧可视化。这很简单:传递我们需要可视化的配置文件到 [browse_dataset.py](/tools/analysis_tools/browse_dataset.py)脚本中。 + +```Bash +python tools/visualizations/browse_dataset.py \ + configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + browse_out --mode pipeline +``` + +转换后的视频将被保存到 `browse_out` 文件夹中。 + +
+ +
+ +```{note} +有关该脚本的参数和使用方法的详细信息,请参考[这里](../user_guides/useful_tools.md)。 +``` + +```{tip} +除了满足我们的好奇心,可视化还可以帮助我们在训练前检查可能影响模型性能的部分,例如配置、数据集和数据转换中的问题。 +``` + +我们可以通过以下脚本进一步可视化学习率调度,以确保配置符合预期: + +```Bash +python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +训练学习率时间表将显示在弹出窗口中。 + +
+ +
+ +```{note} +学习率根据实际批数据大小自动缩放。 +``` + +## 训练 + +运行如下命令启动训练: + +```Bash +python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +根据系统环境,MMAction2 将自动使用最佳设备进行训练。如果有GPU,则默认启动单个GPU训练。当你开始看到 loss 的输出时,就说明你已经成功启动了训练。 + +```Bash +03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267 +03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130 +03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608 +03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900 +03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs +03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658 +03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth. 
+``` + +在没有额外配置的情况下,模型权重将被保存到 `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`,而日志将被存储到 `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`。接下来,我们只需要耐心等待训练完成。 + +```{note} +训练的高级用法,如 CPU 训练、多卡训练及集群训练,请参考[training and Testing](../user_guides/train_test.md) +``` + +## 测试 + +经过 10 个 epoch 后,我们观察到 TSN 在第 6 个 epoch 表现最好,`acc/top1` 达到1.0000: + +```Bash +03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091 +``` + +```{note} +由于在原始 Kinetics400 上进行了预训练,结果非常高,您可能会看到不同的结果 +``` + +然而,该值仅反映了 TSN 在精简版 Kinetics 数据集上的验证性能,而测试结果通常更高,因为在测试数据流水线中增加了更多的数据增强。 + +开始测试: + +```Bash +python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \ + work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth +``` + +并得到如下输出: + +```Bash +03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795 +``` + +该模型在该数据集上实现了 1.000 的 top1 准确率。 + +```{note} +测试的高级用法,如CPU测试、多gpu测试、集群测试,请参考[Training and testing](../user_guides/train_test.md) +``` diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ab1afe741606f68ed26f43b79ec186d0db47dd91 --- /dev/null +++ b/docs/zh_cn/index.rst @@ -0,0 +1,96 @@ +欢迎来到 MMAction2 中文教程! +===================================== + +You can switch between Chinese and English documents in the lower-left corner of the layout. + +.. toctree:: + :maxdepth: 1 + :caption: 新手入门 + + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md + +.. 
toctree:: + :maxdepth: 1 + :caption: 用户指南 + + user_guides/inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md + user_guides/finetune.md + +.. toctree:: + :maxdepth: 1 + :caption: 进阶教程 + + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + useful_tools.md + + +.. toctree:: + :maxdepth: 1 + :caption: 模型支持 + + modelzoo_statistics.md + model_zoo/recognition.md + model_zoo/recognition_audio.md + model_zoo/skeleton.md + model_zoo/detection.md + model_zoo/retrieval.md + model_zoo/localization.md + +.. toctree:: + :maxdepth: 1 + :caption: 数据集支持 + :glob: + + datasetzoo_statistics.md + dataset_zoo/* + +.. toctree:: + :maxdepth: 1 + :caption: 相关项目 + + projectzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: MMAction2 0.x 迁移指南 + + migration.md + +.. toctree:: + :maxdepth: 1 + :caption: API 参考文档 + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: 其他说明 + + notes/ecosystem.md + +.. toctree:: + :caption: 切换语言 + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/zh_cn/make.bat b/docs/zh_cn/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..2119f51099bf37e4fdb6071dce9f451ea44c62dd --- /dev/null +++ b/docs/zh_cn/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. 
Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/zh_cn/migration.md b/docs/zh_cn/migration.md new file mode 100644 index 0000000000000000000000000000000000000000..37c5fb3172f369de1db6046f292eb44dc5ddbc2d --- /dev/null +++ b/docs/zh_cn/migration.md @@ -0,0 +1,488 @@ +# 从 MMAction2 0.x 迁移 + +MMAction2 1.x 引入了一些重构和修改,包括一些向后不兼容的更改。我们提供这个教程,帮助您从 MMAction2 0.x 迁移您的项目。 + +## 新的依赖项 + +MMAction2 1.x 依赖于以下库。建议您准备一个新的运行环境,并根据[安装教程](./get_started/installation.md)进行安装。 + +1. [MMEngine](https://github.com/open-mmlab/mmengine):MMEngine 是引入于 OpenMMLab 2.0 架构中的用于训练深度学习模型的基础库。 +2. [MMCV](https://github.com/open-mmlab/mmcv):MMCV 是用于计算机视觉的基础库。MMAction2 1.x 需要 `mmcv>=2.0.0`,它比 `mmcv-full==2.0.0` 更紧凑和高效。 + +## 配置文件 + +在 MMAction2 1.x 中,我们重构了配置文件的结构。旧风格的配置文件将不兼容。 + +在本节中,我们将介绍配置文件的所有更改。我们假设您已经熟悉[配置文件](./user_guides/config.md)。 + +### 模型设置 + +`model.backbone` 和 `model.neck` 没有更改。对于 `model.cls_head`,我们将 `average_clips` 移到其中,原本设置在 `model.test_cfg` 中。 + +### 数据设置 + +#### **`data`** 中的更改 + +- 原始的 `data` 字段被拆分为 `train_dataloader`、`val_dataloader` 和 `test_dataloader`。这样可以对它们进行细粒度的配置。例如,您可以在训练和测试过程中指定不同的采样器和批大小。 +- `videos_per_gpu` 改名为 `batch_size`。 +- `workers_per_gpu` 改名为 `num_workers`。 + + + + + + + + + +
旧版本 + +```python +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict(...), + val=dict(...), + test=dict(...), +) +``` + +
新版本 + +```python +train_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=True) # 必要 +) + +val_dataloader = dict( + batch_size=32, + num_workers=2, + dataset=dict(...), + sampler=dict(type='DefaultSampler', shuffle=False) # 必要 +) + +test_dataloader = val_dataloader +``` + +
+ +#### **`pipeline`** 中的更改 + +- 原来的格式化变换 **`ToTensor`**、**`Collect`** 被合并为 `PackActionInputs`。 +- 我们不建议在数据集流水线中进行 **`Normalize`**。请从流水线中移除它,并在 `model.data_preprocessor` 字段中设置。 + + + + + + + + + +
旧版本 + +```python + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +``` + +
新版本 + +```python +model.data_preprocessor = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False) + +train_pipeline = [ + dict(type='DecordInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +``` + +
+ +#### **`evaluation`** 中的更改 + +- **`evaluation`** 字段被拆分为 `val_evaluator` 和 `test_evaluator`。不再支持 `interval` 和 `save_best` 参数。 +- `interval` 移到 `train_cfg.val_interval`,`save_best` 移到 `default_hooks.checkpoint.save_best`。 +- 'mean_average_precision'、'mean_class_accuracy'、'mmit_mean_average_precision'、'top_k_accuracy' 被合并为 `AccMetric`,您可以使用 `metric_list` 指定要计算的指标。 +- `AVAMetric` 用于评估 AVA 数据集。 +- `ANetMetric` 用于评估 ActivityNet 数据集。 + + + + + + + + + +
旧版本 + +```python +evaluation = dict( + interval=5, + metrics=['top_k_accuracy', 'mean_class_accuracy']) +``` + +
新版本 + +```python +val_evaluator = dict( + type='AccMetric', + metric_list=('top_k_accuracy', 'mean_class_accuracy')) +test_evaluator = val_evaluator +``` + +
+ +### 学习率策略设置 + +#### **`optimizer`** 和 **`optimizer_config`** 中的更改 + +- 现在我们使用 `optim_wrapper` 字段来配置优化过程。`optimizer` 成为 `optim_wrapper` 的子字段。 +- `paramwise_cfg` 也是 `optim_wrapper` 的子字段,与 `optimizer` 平行。 +- 现在已删除 `optimizer_config`,其中的所有配置都移动到 `optim_wrapper`。 +- `grad_clip` 改名为 `clip_grad`。 + + + + + + + + + +
旧版本 + +```python +optimizer = dict( + type='AdamW', + lr=0.0015, + weight_decay=0.3, + paramwise_cfg = dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + )) + +optimizer_config = dict(grad_clip=dict(max_norm=1.0)) +``` + +
新版本
+
+```python
+optim_wrapper = dict(
+    optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
+    paramwise_cfg = dict(
+        norm_decay_mult=0.0,
+        bias_decay_mult=0.0,
+    ),
+    clip_grad=dict(max_norm=1.0),
+)
+```
+
+
+ +#### **`lr_config`** 中的更改 + +- 删除了 `lr_config` 字段,我们使用新的 `param_scheduler` 来替代它。 +- 删除了与 warmup 相关的参数,因为我们使用策略组合来实现这个功能。 + +新的组合机制非常灵活,您可以使用它来设计多种学习率/动量曲线。 + + + + + + + + + +
旧版本 + +```python +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_iters=5, + warmup_ratio=0.01, + warmup_by_epoch=True) +``` + +
新版本 + +```python +param_scheduler = [ + # 学习率预热 + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + end=5, + # 在每个迭代后更新学习率。 + convert_to_iter_based=True), + # 主要的学习率策略 + dict(type='CosineAnnealingLR', by_epoch=True, begin=5), +] +``` + +
+ +#### **`runner`** 中的更改 + +原始 `runner` 字段中的大多数配置已移至 `train_cfg`、`val_cfg` 和 `test_cfg`,用于配置训练、验证和测试的循环。 + + + + + + + + + +
旧版本 + +```python +runner = dict(type='EpochBasedRunner', max_epochs=100) +``` + +
新版本 + +```python +# `val_interval` 是原 `evaluation.interval`。 +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') # 使用默认验证循环。 +test_cfg = dict(type='TestLoop') # 使用默认测试循环。 +``` + +
+ +事实上,在 OpenMMLab 2.0 中,我们引入了 `Loop` 来控制训练、验证和测试的行为。`Runner` 的功能也发生了变化。您可以在[MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html)中找到更多详细信息。 + +### 运行时设置 + +#### **`checkpoint_config`** 和 **`log_config`** 中的更改 + +`checkpoint_config` 移到 `default_hooks.checkpoint`,`log_config` 移到 `default_hooks.logger`。我们将许多钩子的设置从脚本代码中移动到运行时配置的 `default_hooks` 字段中。 + +```python +default_hooks = dict( + # 更新运行时信息,如当前迭代和学习率。 + runtime_info=dict(type='RuntimeInfoHook'), + + # 记录每个迭代的时间。 + timer=dict(type='IterTimerHook'), + + # 每 100 次迭代打印日志。 + logger=dict(type='LoggerHook', interval=100), + + # 启用参数策略器。 + param_scheduler=dict(type='ParamSchedulerHook'), + + # 每个 epoch 保存一次权重,并自动保存最佳权重。 + checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'), + + # 在分布式环境中设置采样器种子。 + sampler_seed=dict(type='DistSamplerSeedHook'), + + # 在每个 epoch 结束时同步模型缓冲区。 + sync_buffers=dict(type='SyncBuffersHook') +) +``` + +此外,我们将原来的 logger 拆分为 logger 和 visualizer。logger 用于记录信息,visualizer 用于在不同的后端(如终端、TensorBoard 和 Wandb)中显示 logger。 + + + + + + + + + +
旧版本 + +```python +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) +``` + +
新版本 + +```python +default_hooks = dict( + ... + logger=dict(type='LoggerHook', interval=100), +) + +visualizer = dict( + type='ActionVisualizer', + vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')], +) +``` + +
+ +#### **`load_from`** 和 **`resume_from`** 中的更改 + +- 删除了 `resume_from`。现在我们使用 `resume` 和 `load_from` 来替代它。 + - 如果 `resume=True` 并且 `load_from` 不为 None,则从 `load_from` 中的权重恢复训练。 + - 如果 `resume=True` 并且 `load_from` 为 None,则尝试从工作目录中的最新权重恢复。 + - 如果 `resume=False` 并且 `load_from` 不为 None,则只加载权重文件,不恢复训练。 + - 如果 `resume=False` 并且 `load_from` 为 None,则既不加载也不恢复。 + +#### **`dist_params`** 中的更改 + +`dist_params` 字段现在是 `env_cfg` 的子字段。`env_cfg` 中还有一些新的配置。 + +```python +env_cfg = dict( + # 是否启用 cudnn benchmark + cudnn_benchmark=False, + + # 设置多进程参数 + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + + # 设置分布式参数 + dist_cfg=dict(backend='nccl'), +) +``` + +#### **`workflow`** 中的更改 + +删除了与 `workflow` 相关的功能。 + +#### 新字段 **`visualizer`** + +visualizer 是 OpenMMLab 2.0 架构中的新设计。我们在 runner 中使用一个 visualizer 实例来处理结果和日志的可视化,并保存到不同的后端,如终端、TensorBoard 和 Wandb。 + +```python +visualizer = dict( + type='ActionVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + # 取消下面一行的注释,将日志和可视化结果保存到 TensorBoard。 + # dict(type='TensorboardVisBackend') + ] +) +``` + +#### 新字段 **`default_scope`** + +所有注册表在不同包中的定义已移动到 `mmaction.registry` 包中。 + +## Packages + +### `mmaction.apis` + +文档可以在[这里](mmaction.apis)找到。 + +| 函数 | 更改 | +| :--------------------: | :------------------------------------------: | +| `init_recognizer` | 无需更改 | +| `inference_recognizer` | 无需更改 | +| `train_model` | 删除,使用 `runner.train` 进行训练 | +| `multi_gpu_test` | 删除,使用 `runner.test` 进行测试 | +| `single_gpu_test` | 删除,使用 `runner.test` 进行测试 | +| `set_random_seed` | 删除,使用 `mmengine.runner.set_random_seed` | +| `init_random_seed` | 删除,使用 `mmengine.dist.sync_random_seed` | + +### `mmaction.core` + +`mmaction.core` 包已被重命名为 [`mmaction.engine`](mmaction.engine)。 + +| 子包 | 更改 | +| :----------: | :-------------------------------------------------------: | +| `evaluation` | 删除,使用 `mmaction.evaluation` 中的指标 | +| `hooks` | 移动到 `mmaction.engine.hooks` | +| `optimizer` | 移动到 `mmaction.engine.optimizers` | +| `utils` | 删除,分布式环境相关的函数可以在 
`mmengine.dist` 包中找到 |
+
+### `mmaction.datasets`
+
+文档可以在[这里](mmaction.datasets)找到。
+
+#### [`BaseActionDataset`](mmaction.datasets.BaseActionDataset) 中的更改:
+
+| 方法 | 更改 |
+| :--------------------: | :-----------------------------------------: |
+| `prepare_train_frames` | 由 `get_data_info` 替换 |
+| `prepare_test_frames` | 由 `get_data_info` 替换 |
+| `evaluate` | 删除,使用 `mmengine.evaluator.Evaluator` |
+| `dump_results` | 删除,使用 `mmengine.evaluator.DumpResults` |
+| `load_annotations` | 替换为 `load_data_list` |
+
+现在,您可以编写一个继承自 `BaseActionDataset` 的新 Dataset 类,并仅重写 `load_data_list`。要加载更多的数据信息,您可以像 `RawframeDataset` 和 `AVADataset` 那样重写 `get_data_info`。
+`mmaction.datasets.pipelines` 被重命名为 `mmaction.datasets.transforms`,`mmaction.datasets.pipelines.augmentations` 被重命名为 `mmaction.datasets.transforms.processing`。
+
+### `mmaction.models`
+
+文档可以在[这里](mmaction.models)找到。所有 **backbones**、**necks** 和 **losses** 的接口没有更改。
+
+[`BaseRecognizer`](mmaction.models.BaseRecognizer) 中的更改:
+
+| 方法 | 更改 |
+| :-------------: | :----------------------------------------------------------------------------------------------------------------------------: |
+| `extract_feat` | 增强的方法,现在支持三个阶段(`backbone`、`neck`、`head`)的输出特征,并且可以处理不同的模式,如 `train_mode` 和 `test_mode`。 |
+| `forward` | 现在只接受三个参数:`inputs`、`data_samples` 和 `mode`。详细信息请参阅[文档](mmaction.models.BaseRecognizer)。 |
+| `forward_train` | 已替换为 `loss`。 |
+| `forward_test` | 已替换为 `predict`。 |
+| `train_step` | `optimizer` 参数被替换为 `optim_wrapper`,它接受 [`OptimWrapper`](mmengine.optim.OptimWrapper)。 |
+| `val_step` | 原 `val_step` 与 `train_step` 相同,现在调用 `predict`。 |
+| `test_step` | 新方法,与 `val_step` 相同。 |
+
+[BaseHead](mmaction.models.BaseHead) 中的更改:
+
+| 方法 | 更改 |
+| :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| `forward` | 无需更改 |
+| `loss` | 接受 `feats` 和 `data_samples`,而不是 `cls_score` 和 `labels` 来计算损失。`data_samples` 是
[ActionDataSample](mmaction.structures.ActionDataSample) 的列表。 | +| `predict` | 接受 `feats` 和 `data_samples` 来预测分类分数。 | + +### `mmaction.utils` + +| 函数 | 更改 | +| :---------------------: | :--------------------------------------------------------: | +| `collect_env` | 无需更改 | +| `get_root_logger` | 删除,使用 `mmengine.MMLogger.get_current_instance` | +| `setup_multi_processes` | 删除,使用 `mmengine.utils.dl_utils.setup_multi_processes` | + +### 其他更改 + +- 我们将所有注册器的定义从各个包移动到了 `mmaction.registry` 。 diff --git a/docs/zh_cn/notes/ecosystem.md b/docs/zh_cn/notes/ecosystem.md new file mode 100644 index 0000000000000000000000000000000000000000..7fff5e41f0d22ef826b14d269f5994c4ea60c9d4 --- /dev/null +++ b/docs/zh_cn/notes/ecosystem.md @@ -0,0 +1,23 @@ +# 基于 MMAction2 的生态项目 + +有许多研究工作和项目是基于 MMAction2 构建的。 +我们列举了一些例子,展示了如何扩展 MMAction2 来适用于您自己的项目。 +由于页面可能尚未完成,所以请随时通过提交PR来更新此页面。 + +## 作为扩展的项目 + +- [OTEAction2](https://github.com/openvinotoolkit/mmaction2):用于动作识别的 OpenVINO 训练扩展。 +- [PYSKL](https://github.com/kennymckormick/pyskl):一个专注于基于骨骼点动作识别的工具箱。 + +## 论文相关的项目 + +还有一些与论文一起发布的项目。 +其中一些论文发表在顶级会议(CVPR、ICCV 和 ECCV)上,其他一些也具有很高的影响力。 +我们按照会议时间列出它们,方便社区参考。 + +- Video Swin Transformer,CVPR 2022 [\[论文\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition,ICCV 2021 Oral [\[论文\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective,ICCV 2021 Oral [\[论文\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) +- MGSampler: An Explainable Sampling Strategy for Video Action Recognition,ICCV 2021 [\[论文\]](https://arxiv.org/abs/2104.09952)[\[github\]](https://github.com/MCG-NJU/MGSampler) +- MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions,ICCV 2021 [\[论文\]](https://arxiv.org/abs/2105.07404) +- Long 
Short-Term Transformer for Online Action Detection,NeurIPS 2021 [\[论文\]](https://arxiv.org/abs/2107.03377)[\[github\]](https://github.com/amazon-research/long-short-term-transformer)
diff --git a/docs/zh_cn/notes/pytorch2.0.md b/docs/zh_cn/notes/pytorch2.0.md
new file mode 100644
index 0000000000000000000000000000000000000000..09499beacd30f21384ebf64ab62e2607a2675d11
--- /dev/null
+++ b/docs/zh_cn/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training & validation. We provide a benchmark result and compatibility of typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of other models remains consistent before and after compilation.
+
+| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric |
+| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb |
False | 0.323 | 21651 | 78.90 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 | +| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible | diff --git a/docs/zh_cn/project_zoo.py b/docs/zh_cn/project_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..88cd0484baf5c4de0ecca8c55abb2e59ad62dd6d --- /dev/null +++ b/docs/zh_cn/project_zoo.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +from pathlib import Path + +from utils import replace_link + +# This script reads /projects/*/README.md and generate projectzoo.md + +all_files = list(Path('../../projects/').glob('*/README.md')) +example_project = '../../projects/example_project/README.md' +all_files.remove(Path(example_project)) +all_files.insert(0, Path(example_project)) + +project_zoo = open('../../projects/README.md').read() +for file in all_files: + chinese_readme = Path(str(file).replace('README.md', 'README_zh-CN.md')) + if chinese_readme.exists(): + file = chinese_readme + with open(file) as f: + content = f.read() + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + file) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + file) + + project_zoo += content + +with open('projectzoo.md', 'w') as f: + f.write(project_zoo) diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py new file mode 100644 index 0000000000000000000000000000000000000000..1ee25119c162e39d7f5a33c0c5a0748bd53ca1d8 --- /dev/null +++ b/docs/zh_cn/stat.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +import re +import shutil +from collections import defaultdict +from pathlib import Path + +from modelindex.load_model_index import load +from modelindex.models.Result import Result 
+from tabulate import tabulate +from utils import replace_link + +MMACT_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages. +GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' +MODELZOO_TEMPLATE = """\ +# 模型库统计 + +在本页面中,我们列举了我们支持的[所有算法](#所有已支持的算法)。你可以点击链接跳转至对应的模型详情页面。 + +另外,我们还列出了我们提供的所有模型权重文件。你可以使用排序和搜索功能找到需要的模型权重,并使用链接跳转至模型详情页面。 + +## 所有已支持的算法 + +* 论文数量:{num_papers} +{type_msg} + +* 模型权重文件数量:{num_ckpts} +{paper_msg} + +""" # noqa: E501 + +METRIC_ALIAS = { + 'Top 1 Accuracy': 'Top-1 (%)', + 'Top 5 Accuracy': 'Top-5 (%)', +} + +TASK_MAP = dict( + detection='时空行为检测模型', + localization='时序动作定位模型', + recognition='行为识别模型', + skeleton='基于骨骼点的行为识别模型', + retrieval='视频检索模型', + recognition_audio='基于声音的行为识别模型') + +model_index = load(str(MMACT_ROOT / 'model-index.yml')) + + +def build_collections(model_index): + # add models for collections + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col + + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) + if model.results is None: + setattr(model, 'tasks', []) + else: + setattr(model, 'tasks', [result.task for result in model.results]) + + +build_collections(model_index) + +# save a map from model name to title in README +model2title = dict() + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMACT_ROOT / collection.readme) as f: + readme = f.read() + + ckpts = set(x.lower().strip() + for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme_title = re.search(r'^#\s+.+', readme) + + readme = 
Path(collection.filepath).parents[1].with_suffix('.md').name + model = Path(collection.filepath).parent.name + model2title[model] = readme_title.group()[2:].replace(' ', '-') + paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}' + f'#{model2title[model]}) ({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + + # Write a copy of README + with open(MMACT_ROOT / collection.readme) as f: + content = f.read() + readme_path = Path(collection.filepath) + copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name + if not copy.exists(): + with open(copy, 'w') as copy_file: + task = readme_path.parents[1].name + head_content = f'# {TASK_MAP[task]}\n' + copy_file.write(head_content) + + def lower_heading(match): + return '#' + match.group() + + content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content, + Path(collection.readme)) + content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content, + Path(collection.readme)) + + content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M) + + with open(copy, 'a') as copy_file: + copy_file.write(content) + + +if PAPERS_ROOT.exists(): + shutil.rmtree(PAPERS_ROOT) +PAPERS_ROOT.mkdir(exist_ok=True) +for collection in model_index.collections: + generate_paper_page(collection) + + +def scatter_results(models): + model_result_pairs = [] + for model in models: + if model.results is None: + result = Result(task=None, dataset=None, metrics={}) + model_result_pairs.append((model, result)) + else: + for result in model.results: + model_result_pairs.append((model, result)) + return 
model_result_pairs + + +def generate_summary_table(task, model_result_pairs, title=None): + metrics = set() + for model, result in model_result_pairs: + if result.task == task: + metrics = metrics.union(result.metrics.keys()) + metrics = sorted(list(metrics)) + + rows = [] + + def convert2float(number): + units = {'M': 1e6, 'G': 1e9, 'T': 1e12} + if isinstance(number, str): + num = float(number.rstrip('MGT')) + number = num * units[number[-1]] + return number + + for model, result in model_result_pairs: + if result.task != task: + continue + name = model.name + if model.metadata.parameters is not None: + params = convert2float(model.metadata.parameters) + params = f'{params / 1e6:.2f}' # Params + else: + params = None + if model.metadata.flops is not None: + flops = convert2float(model.metadata.flops) + flops = f'{flops / 1e9:.2f}' # Flops + else: + flops = None + + readme = Path( + model.collection.filepath).parents[1].with_suffix('.md').name + model = Path(model.collection.filepath).parent.name + page = f'[链接]({PAPERS_ROOT / readme}#{model2title[model]})' + model_metrics = [] + for metric in metrics: + model_metrics.append(str(result.metrics.get(metric, ''))) + + rows.append([name, params, flops, *model_metrics, page]) + + with open('modelzoo_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: model-summary\n""") + header = [ + '模型', + '参数量 (M)', + 'Flops (G)', + *[METRIC_ALIAS.get(metric, metric) for metric in metrics], + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, **table_cfg)) + f.write('\n```\n') + + +def generate_dataset_wise_table(task, model_result_pairs, title=None): + dataset_rows = defaultdict(list) + for model, result in model_result_pairs: + if result.task == task: + dataset_rows[result.dataset].append((model, result)) + + if title is not None: + with open('modelzoo_statistics.md', 'a') as f: + 
f.write(f'\n{title}') + for dataset, pairs in dataset_rows.items(): + generate_summary_table(task, pairs, title=f'### {dataset}') + + +model_result_pairs = scatter_results(model_index.models) + +# Generate Action Recognition Summary +generate_dataset_wise_table( + task='Action Recognition', + model_result_pairs=model_result_pairs, + title='## 行为识别', +) + +# Generate Action Detection Summary +generate_dataset_wise_table( + task='Action Detection', + model_result_pairs=model_result_pairs, + title='## 时空行为检测', +) + +# Generate Skeleton-based Action Recognition Summary +generate_dataset_wise_table( + task='Skeleton-based Action Recognition', + model_result_pairs=model_result_pairs, + title='## 骨骼点行为识别', +) + +# Generate Video Retrieval Summary +generate_dataset_wise_table( + task='Video Retrieval', + model_result_pairs=model_result_pairs, + title='## 视频检索', +) + +# Generate Temporal Action Localization Summary +generate_dataset_wise_table( + task='Temporal Action Localization', + model_result_pairs=model_result_pairs, + title='## 时序动作定位', +) diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md new file mode 100644 index 0000000000000000000000000000000000000000..88b3a3777af732797f98e5cba78c68808fa655c2 --- /dev/null +++ b/docs/zh_cn/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..986153fd75700fff514570e2bee8f94761cce264 --- /dev/null +++ b/docs/zh_cn/useful_tools.md @@ -0,0 +1,91 @@ +# 分析工具 + +除了训练/测试脚本外,我们在 `tools/` 目录下还提供了许多有用的工具。 + +## 分析工具链接 + + + +- [](#分析工具) + - [分析工具](#分析工具) + - [模型转换](#模型转换) + - [准备模型进行发布](#准备模型进行发布) + - [杂项](#杂项) + - [评估指标](#评估指标) + - [打印完整配置](#打印完整配置) + - [检查视频](#检查视频) + - [多流融合](#多流融合) + + + +## 模型转换 + +### 准备模型进行发布 + +`tools/deployment/publish_model.py` 帮助用户准备他们的模型进行发布。 + +在将模型上传到 AWS 之前,您可能想要: + +(1)将模型权重转换为 CPU 张量。 +(2)删除优化器状态信息。 +(3)计算权重文件的哈希值,并将哈希值添加到文件名中。 + +```shell 
+python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+例如,
+
+```shell
+python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth
+```
+
+最终输出的文件名将是 `tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb-{hash id}.pth`。
+
+## 杂项
+
+### 评估指标
+
+`tools/analysis_tools/eval_metric.py` 根据配置文件评估保存在文件中的结果的某些指标。
+
+保存的结果文件是通过在 `tools/test.py` 中设置参数 `--out ${RESULT_FILE}` 来创建的,以指示结果文件,其中存储了整个模型的最终输出。
+
+```shell
+python tools/analysis_tools/eval_metric.py ${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}]
+```
+
+### 打印完整配置
+
+`tools/analysis_tools/print_config.py` 逐字打印整个配置,展开所有导入项。
+
+```shell
+python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
+
+### 检查视频
+
+`tools/analysis_tools/check_videos.py` 使用指定的视频编码器迭代由输入配置文件指定的所有样本,查找无效的视频(损坏或缺失),并将相应的文件路径保存到输出文件中。请注意,删除无效视频后,用户需要重新生成视频文件列表。
+
+```shell
+python tools/analysis_tools/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos]
+```
+
+### 多流融合
+
+`tools/analysis_tools/report_accuracy.py` 使用推理保存的结果(在测试时设置 `--dump res.pkl`)来融合多流预测分数,即后融合(late fusion)。
+
+```shell
+python tools/analysis_tools/report_accuracy.py [--preds ${RESULT_PKL_1 [RESULT_PKL_2 ...]}] [--coefficients ${COEFFICIENT_1 [COEFFICIENT_2, ...]}] [--apply-softmax]
+```
+
+以 joint-bone 融合为例,这是基于骨骼动作识别任务的一种常见实践。
+
+```shell
+python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0
+```
+
+```
+Mean Class Accuracy: 0.9180
+Top 1 Accuracy: 0.9333
+Top 5 Accuracy: 0.9833
+```
diff --git a/docs/zh_cn/user_guides/config.md b/docs/zh_cn/user_guides/config.md
new file mode 100644
index
0000000000000000000000000000000000000000..798a9f8884449437550d86eebb7ead7a02a51607 --- /dev/null +++ b/docs/zh_cn/user_guides/config.md @@ -0,0 +1,711 @@ +# 学习配置文件 + +我们使用 Python 文件作为配置文件,将模块化和继承设计融入我们的配置系统中,这方便进行各种实验。 +您可以在 `$MMAction2/configs` 目录下找到所有提供的配置文件。如果您想要查看配置文件, +您可以运行 `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` 来查看完整的配置文件。 + + + +- [学习配置文件](#学习配置文件) + - [通过脚本参数修改配置](#通过脚本参数修改配置) + - [配置文件结构](#配置文件结构) + - [配置文件命名约定](#配置文件命名约定) + - [动作识别的配置系统](#动作识别的配置系统) + - [时空动作检测的配置系统](#时空动作检测的配置系统) + - [动作定位的配置系统](#动作定位的配置系统) + + + +## 通过脚本参数修改配置 + +在使用 `tools/train.py` 或 `tools/test.py` 提交作业时,您可以通过指定 `--cfg-options` 来原地修改配置。 + +- 更新字典的配置键。 + + 可以按照原始配置中字典键的顺序来指定配置选项。 + 例如,`--cfg-options model.backbone.norm_eval=False` 将模型骨干中的所有 BN 模块更改为 `train` 模式。 + +- 更新配置列表中的键。 + + 一些配置字典在配置文件中以列表形式组成。例如,训练流程 `train_pipeline` 通常是一个列表, + 例如 `[dict(type='SampleFrames'), ...]`。如果您想要在流程中将 `'SampleFrames'` 更改为 `'DenseSampleFrames'`, + 您可以指定 `--cfg-options train_pipeline.0.type=DenseSampleFrames`。 + +- 更新列表/元组的值。 + + 如果要更新的值是列表或元组。例如,配置文件通常设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`。如果您想要 + 更改此键,您可以指定 `--cfg-options model.data_preprocessor.mean="[128,128,128]"`。请注意,引号 " 是支持列表/元组数据类型的必需内容。 + +## 配置文件结构 + +`configs/_base_` 下有 3 种基本组件类型,即 models、schedules 和 default_runtime。 +许多方法只需要一个模型、一个训练计划和一个默认运行时组件就可以轻松构建,如 TSN、I3D、SlowOnly 等。 +由 `_base_` 组件组成的配置文件被称为 _primitive_。 + +对于同一文件夹下的所有配置文件,建议只有**一个** _primitive_ 配置文件。其他所有配置文件都应该继承自 _primitive_ 配置文件。这样,继承级别的最大值为 3。 + +为了方便理解,我们建议贡献者继承现有方法。 +例如,如果基于 TSN 进行了一些修改,用户可以首先通过指定 `_base_ = ../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 来继承基本的 TSN 结构,然后在配置文件中修改必要的字段。 + +如果您正在构建一个与任何现有方法的结构不共享的全新方法,可以在 `configs/TASK` 下创建一个文件夹。 + +请参考 [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) 获取详细文档。 + +## 配置文件命名约定 + +我们遵循以下样式来命名配置文件。建议贡献者遵循相同的样式。配置文件名分为几个部分,不同部分逻辑上用下划线 `'_'` 连接,同一部分的设置用破折号 `'-'` 连接。 + +``` +{算法信息}_{模块信息}_{训练信息}_{数据信息}.py +``` + +`{xxx}` 是必填字段,`[yyy]` 是可选字段。 + 
+- `{算法信息}`: + - `{模型}`: 模型类型,例如 `tsn`、`i3d`、`swin`、`vit` 等。 + - `[模型设置]`: 某些模型的特定设置,例如 `base`、`p16`、`w877` 等。 +- `{模块信息}`: + - `[预训练信息]`: 预训练信息,例如 `kinetics400-pretrained`、`in1k-pre` 等。 + - `{骨干网络}`: 骨干网络类型,例如 `r50`(ResNet-50)等。 + - `[骨干网络设置]`: 某些骨干网络的特定设置,例如 `nl-dot-product`、`bnfrozen`、`nopool` 等。 +- `{训练信息}`: + - `{gpu x batch_per_gpu]}`: GPU 和每个 GPU 上的样本数。 + - `{pipeline设置}`: 帧采样设置,例如 `dense`、`{clip_len}x{frame_interval}x{num_clips}`、`u48` 等。 + - `{schedule}`: 训练计划,例如 `coslr-20e`。 +- `{数据信息}`: + - `{数据集}`: 数据集名称,例如 `kinetics400`、`mmit` 等。 + - `{模态}`: 数据模态,例如 `rgb`、`flow`、`keypoint-2d` 等。 + +### 动作识别的配置系统 + +我们将模块化设计融入我们的配置系统中, +这方便进行各种实验。 + +- TSN 的示例 + + 为了帮助用户对完整的配置结构和动作识别系统中的模块有一个基本的了解, + 我们对 TSN 的配置进行简要注释如下。有关每个模块中每个参数的更详细用法和替代方法,请参阅 API 文档。 + + ```python + # 模型设置 + model = dict( # 模型的配置 + type='Recognizer2D', # 识别器的类名 + backbone=dict( # 骨干网络的配置 + type='ResNet', # 骨干网络的名称 + pretrained='torchvision://resnet50', # 预训练模型的 URL/网站 + depth=50, # ResNet 模型的深度 + norm_eval=False), # 是否在训练时将 BN 层设置为评估模式 + cls_head=dict( # 分类头的配置 + type='TSNHead', # 分类头的名称 + num_classes=400, # 要分类的类别数量。 + in_channels=2048, # 分类头的输入通道数。 + spatial_type='avg', # 空间维度池化的类型 + consensus=dict(type='AvgConsensus', dim=1), # 一致性模块的配置 + dropout_ratio=0.4, # dropout 层中的概率 + init_std=0.01, # 线性层初始化的标准差值 + average_clips='prob'), # 平均多个剪辑结果的方法 + data_preprocessor=dict( # 数据预处理器的配置 + type='ActionDataPreprocessor', # 数据预处理器的名称 + mean=[123.675, 116.28, 103.53], # 不同通道的均值用于归一化 + std=[58.395, 57.12, 57.375], # 不同通道的标准差用于归一化 + format_shape='NCHW'), # 最终图像形状的格式 + # 模型训练和测试设置 + train_cfg=None, # TSN 的训练超参数的配置 + test_cfg=None) # TSN 的测试超参数的配置 + + # 数据集设置 + dataset_type = 'RawframeDataset' # 用于训练、验证和测试的数据集类型 + data_root = 'data/kinetics400/rawframes_train/' # 用于训练的数据的根路径 + data_root_val = 'data/kinetics400/rawframes_val/' # 用于验证和测试的数据的根路径 + ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # 用于训练的注释文件的路径 + ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 
用于验证的注释文件的路径 + ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 用于测试的注释文件的路径 + + train_pipeline = [ # 训练数据处理流程 + dict( # SampleFrames 的配置 + type='SampleFrames', # 采样帧的流程,从视频中采样帧 + clip_len=1, # 每个采样输出剪辑的帧数 + frame_interval=1, # 相邻采样帧的时间间隔 + num_clips=3), # 要采样的剪辑数 + dict( # RawFrameDecode 的配置 + type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧 + dict( # Resize 的配置 + type='Resize', # 调整大小的流程 + scale=(-1, 256)), # 要调整图像的比例 + dict( # MultiScaleCrop 的配置 + type='MultiScaleCrop', # 多尺度裁剪的流程,根据随机选择的尺度列表裁剪图像 + input_size=224, # 网络的输入大小 + scales=(1, 0.875, 0.75, 0.66), # 要选择的宽度和高度的尺度 + random_crop=False, # 是否随机采样裁剪框 + max_wh_scale_gap=1), # 宽度和高度尺度级别的最大差距 + dict( # Resize 的配置 + type='Resize', # 调整大小的流程 + scale=(224, 224), # 要调整图像的比例 + keep_ratio=False), # 是否保持纵横比进行调整大小 + dict( # Flip 的配置 + type='Flip', # 翻转的流程 + flip_ratio=0.5), # 实施翻转的概率 + dict( # FormatShape 的配置 + type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format + input_format='NCHW'), # 最终图像形状的格式 + dict(type='PackActionInputs') # PackActionInputs 的配置 + ] + val_pipeline = [ # 验证数据处理流程 + dict( # SampleFrames 的配置 + type='SampleFrames', # 采样帧的流程,从视频中采样帧 + clip_len=1, # 每个采样输出剪辑的帧数 + frame_interval=1, # 相邻采样帧的时间间隔 + num_clips=3, # 要采样的剪辑数 + test_mode=True), # 是否在采样时设置为测试模式 + dict( # RawFrameDecode 的配置 + type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧 + dict( # Resize 的配置 + type='Resize', # 调整大小的流程 + scale=(-1, 256)), # 要调整图像的比例 + dict( # CenterCrop 的配置 + type='CenterCrop', # 中心裁剪的流程,从图像中裁剪中心区域 + crop_size=224), # 要裁剪的图像大小 + dict( # Flip 的配置 + type='Flip', # 翻转的流程 + flip_ratio=0), # 实施翻转的概率 + dict( # FormatShape 的配置 + type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format + input_format='NCHW'), # 最终图像形状的格式 + dict(type='PackActionInputs') # PackActionInputs 的配置 + ] + test_pipeline = [ # 测试数据处理流程 + dict( # SampleFrames 的配置 + type='SampleFrames', # 采样帧的流程,从视频中采样帧 + clip_len=1, # 每个采样输出剪辑的帧数 + frame_interval=1, # 相邻采样帧的时间间隔 + num_clips=25, # 要采样的剪辑数 + test_mode=True), # 是否在采样时设置为测试模式 + dict( # 
RawFrameDecode 的配置 + type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧 + dict( # Resize 的配置 + type='Resize', # 调整大小的流程 + scale=(-1, 256)), # 要调整图像的比例 + dict( # TenCrop 的配置 + type='TenCrop', # 十次裁剪的流程,从图像中裁剪十个区域 + crop_size=224), # 要裁剪的图像大小 + dict( # Flip 的配置 + type='Flip', # 翻转的流程 + flip_ratio=0), # 实施翻转的概率 + dict( # FormatShape 的配置 + type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format + input_format='NCHW'), # 最终图像形状的格式 + dict(type='PackActionInputs') # PackActionInputs 的配置 + ] + + train_dataloader = dict( # 训练数据加载器的配置 + batch_size=32, # 训练时每个单个 GPU 的批量大小 + num_workers=8, # 训练时每个单个 GPU 的数据预取进程数 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程,这可以加速训练速度 + sampler=dict( + type='DefaultSampler', # 支持分布式和非分布式训练的 DefaultSampler。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # 每个 epoch 随机打乱训练数据 + dataset=dict( # 训练数据集的配置 + type=dataset_type, + ann_file=ann_file_train, # 注释文件的路径 + data_prefix=dict(img=data_root), # 帧路径的前缀 + pipeline=train_pipeline)) + val_dataloader = dict( # 验证数据加载器的配置 + batch_size=1, # 验证时每个单个 GPU 的批量大小 + num_workers=8, # 验证时每个单个 GPU 的数据预取进程数 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 验证和测试时不进行随机打乱 + dataset=dict( # 验证数据集的配置 + type=dataset_type, + ann_file=ann_file_val, # 注释文件的路径 + data_prefix=dict(img=data_root_val), # 帧路径的前缀 + pipeline=val_pipeline, + test_mode=True)) + test_dataloader = dict( # 测试数据加载器的配置 + batch_size=32, # 测试时每个单个 GPU 的批量大小 + num_workers=8, # 测试时每个单个 GPU 的数据预取进程数 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 验证和测试时不进行随机打乱 + dataset=dict( # 测试数据集的配置 + type=dataset_type, + ann_file=ann_file_val, # 注释文件的路径 + data_prefix=dict(img=data_root_val), # 帧路径的前缀 + pipeline=test_pipeline, + test_mode=True)) + + # 评估设置 + val_evaluator = dict(type='AccMetric') # 验证评估器的配置 + test_evaluator = val_evaluator # 
测试评估器的配置 + + train_cfg = dict( # 训练循环的配置 + type='EpochBasedTrainLoop', # 训练循环的名称 + max_epochs=100, # 总的训练周期数 + val_begin=1, # 开始验证的训练周期 + val_interval=1) # 验证间隔 + val_cfg = dict( # 验证循环的配置 + type='ValLoop') # 验证循环的名称 + test_cfg = dict( # 测试循环的配置 + type='TestLoop') # 测试循环的名称 + + # 学习策略 + param_scheduler = [ # 更新优化器参数的学习率测率,支持字典或列表 + dict(type='MultiStepLR', # 达到一个里程碑时衰减学习率 + begin=0, # 开始更新学习率的步骤 + end=100, # 结束更新学习率的步骤 + by_epoch=True, # 是否按 epoch 更新学习率 + milestones=[40, 80], # 衰减学习率的步骤 + gamma=0.1)] # 学习率衰减的乘法因子 + + # 优化器 + optim_wrapper = dict( # 优化器包装器的配置 + type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 可以启用混合精度训练 + optimizer=dict( # 优化器的配置。支持 PyTorch 中的各种优化器。参考 https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # 优化器的名称 + lr=0.01, # 学习率 + momentum=0.9, # 动量因子 + weight_decay=0.0001), # 权重衰减 + clip_grad=dict(max_norm=40, norm_type=2)) # 梯度裁剪的配置 + + # 运行时设置 + default_scope = 'mmaction' # 用于查找模块的默认注册表作用域。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( # 执行默认操作的钩子,如更新模型参数和保存权重。 + runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子 + timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器 + logger=dict( + type='LoggerHook', # 用于记录训练/验证/测试阶段的日志记录器 + interval=20, # 打印日志的间隔 + ignore_last=False), # 忽略每个 epoch 中最后几个迭代的日志 + param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中某些超参数的钩子 + checkpoint=dict( + type='CheckpointHook', # 定期保存权重的钩子 + interval=3, # 保存的周期 + save_best='auto', # 用于评估最佳权重的指标 + max_keep_ckpts=3), # 保留的最大权重文件数量 + sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器 + sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区 + + env_cfg = dict( # 设置环境的字典 + cudnn_benchmark=False, # 是否启用 cudnn benchmark + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数 + dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口号 + + log_processor = dict( + type='LogProcessor', # 用于格式化日志信息的日志处理器 + window_size=20, # 默认的平滑间隔 + by_epoch=True) # 是否使用 epoch 
类型格式化日志 + vis_backends = [ # 可视化后端的列表 + dict(type='LocalVisBackend')] # 本地可视化后端 + visualizer = dict( # 可视化器的配置 + type='ActionVisualizer', # 可视化器的名称 + vis_backends=vis_backends) + log_level = 'INFO' # 日志记录的级别 + load_from = None # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。 + resume = False # 是否从 `load_from` 中定义的权重恢复。如果 `load_from` 为 None,则会从 `work_dir` 中恢复最新的权重。 + ``` + +### 时空动作检测的配置系统 + +我们将模块化设计融入我们的配置系统中,这方便进行各种实验。 + +- FastRCNN 的示例 + + 为了帮助用户对完整的配置结构和时空动作检测系统中的模块有一个基本的了解, + 我们对 FastRCNN 的配置进行简要注释如下。有关每个模块中每个参数的更详细用法和替代方法,请参阅 API 文档。 + + ```python + # 模型设置 + model = dict( # 模型的配置 + type='FastRCNN', # 检测器的类名 + _scope_='mmdet', # 当前配置的范围 + backbone=dict( # 骨干网络的配置 + type='ResNet3dSlowOnly', # 骨干网络的名称 + depth=50, # ResNet 模型的深度 + pretrained=None, # 预训练模型的 URL/网站 + pretrained2d=False, # 如果预训练模型是 2D 的 + lateral=False, # 如果骨干网络带有横向连接 + num_stages=4, # ResNet 模型的阶段数 + conv1_kernel=(1, 7, 7), # Conv1 的卷积核大小 + conv1_stride_t=1, # Conv1 的时间步长 + pool1_stride_t=1, # Pool1 的时间步长 + spatial_strides=(1, 2, 2, 1)), # 每个 ResNet 阶段的空间步长 + roi_head=dict( # roi_head 的配置 + type='AVARoIHead', # roi_head 的名称 + bbox_roi_extractor=dict( # bbox_roi_extractor 的配置 + type='SingleRoIExtractor3D', # bbox_roi_extractor 的名称 + roi_layer_type='RoIAlign', # RoI 操作的类型 + output_size=8, # RoI 操作的输出特征大小 + with_temporal_pool=True), # 是否进行时间维度的池化 + bbox_head=dict( # bbox_head 的配置 + type='BBoxHeadAVA', # bbox_head 的名称 + in_channels=2048, # 输入特征的通道数 + num_classes=81, # 动作类别数 + 1 + multilabel=True, # 数据集是否为多标签 + dropout_ratio=0.5), # 使用的 dropout 比例 + data_preprocessor=dict( # 数据预处理器的配置 + type='ActionDataPreprocessor', # 数据预处理器的名称 + mean=[123.675, 116.28, 103.53], # 不同通道的均值用于归一化 + std=[58.395, 57.12, 57.375], # 不同通道的标准差用于归一化 + format_shape='NCHW')) # 最终图像形状的格式 + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', # 分配器的名称 + pos_iou_thr=0.9, # 正样本的 IoU 阈值,> pos_iou_thr -> 正样本 + neg_iou_thr=0.9, # 负样本的 IoU 阈值,< neg_iou_thr -> 负样本 + min_pos_iou=0.9), # 正样本的最小可接受 IoU + sampler=dict( + type='RandomSampler', 
# 采样器的名称 + num=32, # 采样器的批处理大小 + pos_fraction=1, # 采样器的正样本比例 + neg_pos_ub=-1, # 负样本与正样本数量比率的上限 + add_gt_as_proposals=True), # 将 gt 边界框添加到 proposals 中 + pos_weight=1.0)), # 正样本的损失权重 + test_cfg=dict(rcnn=None)) # 测试的配置 + + # 数据集设置 + dataset_type = 'AVADataset' # 训练、验证和测试的数据集类型 + data_root = 'data/ava/rawframes' # 数据的根目录 + anno_root = 'data/ava/annotations' # 注释的根目录 + + ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # 训练注释文件的路径 + ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # 验证注释文件的路径 + + exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # 训练排除注释文件的路径 + exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # 验证排除注释文件的路径 + + label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # 标签文件的路径 + + proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # 训练示例的人体检测 proposals 文件的路径 + proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # 验证示例的人体检测 proposals 文件的路径 + + train_pipeline = [ + dict( + type='AVASampleFrames', # 从视频中采样帧的管道 + clip_len=4, # 每个采样输出的帧数 + frame_interval=16), # 相邻采样帧之间的时间间隔 + dict( + type='RawFrameDecode'), # 加载和解码帧的管道,使用给定的索引选择原始帧 + dict( + type='RandomRescale', # 随机缩放短边 + scale_range=(256, 320)), # 随机缩放的短边尺寸范围 + dict( + type='RandomCrop', # 随机裁剪给定大小的补丁 + size=256), # 裁剪补丁的大小 + dict( + type='Flip', # 翻转管道 + flip_ratio=0.5), # 翻转的概率 + dict( + type='FormatShape', # 格式化形状的管道,将最终图像形状格式化为给定的输入格式 + input_format='NCTHW', # 最终图像形状的格式 + collapse=True), # 如果 N == 1,则减少维度 N + dict(type='PackActionInputs') # 打包输入数据 + ] + + val_pipeline = [ + dict( + type='AVASampleFrames', # 从视频中采样帧的管道 + clip_len=4, # 每个采样输出的帧数 + frame_interval=16), # 相邻采样帧之间的时间间隔 + dict( + type='RawFrameDecode'), # 加载和解码帧的管道,使用给定的索引选择原始帧 + dict( + type='Resize', # 调整大小的管道 + scale=(-1, 256)), # 调整图像的尺度 + dict( + type='FormatShape', # 格式化形状的管道,将最终图像形状格式化为给定的输入格式 + input_format='NCTHW', # 最终图像形状的格式 + collapse=True), # 如果 N == 1,则减少维度 N + dict(type='PackActionInputs') # 打包输入数据 + ] 
+ + train_dataloader = dict( + batch_size=32, # 每个单 GPU 训练的批处理大小 + num_workers=8, # 每个单 GPU 训练时预取数据的 worker 数量 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程,这可以加快训练速度 + sampler=dict( + type='DefaultSampler', # 默认采样器,支持分布式和非分布式训练。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # 在每个 epoch 中随机打乱训练数据 + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, # 注释文件的路径 + exclude_file=exclude_file_train, # 排除注释文件的路径 + label_file=label_file, # 标签文件的路径 + data_prefix=dict(img=data_root), # 帧路径的前缀 + proposal_file=proposal_file_train, # 人体检测 proposals 的路径 + pipeline=train_pipeline) + ) + val_dataloader = dict( + batch_size=1, # 每个单 GPU 评估的批处理大小 + num_workers=8, # 每个单 GPU 评估时预取数据的 worker 数量 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 在验证和测试时不打乱数据 + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # 注释文件的路径 + exclude_file=exclude_file_val, # 排除注释文件的路径 + label_file=label_file, # 标签文件的路径 + data_prefix=dict(img=data_root_val), # 帧路径的前缀 + proposal_file=proposal_file_val, # 人体检测 proposals 的路径 + pipeline=val_pipeline, + test_mode=True) + ) + test_dataloader = val_dataloader # 测试数据加载器的配置 + + # 评估设置 + val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) + test_evaluator = val_evaluator # 测试评估器的配置 + + train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的名称 + max_epochs=20, # 总的训练 epoch 数量 + val_begin=1, # 开始验证的 epoch + val_interval=1) # 验证的间隔 + val_cfg = dict( + type='ValLoop') # 验证循环的名称 + test_cfg = dict( + type='TestLoop') # 测试循环的名称 + + # 学习策略 + param_scheduler = [ + dict( + type='LinearLR', # 线性减少每个参数组的学习率 + start_factor=0.1, # 第一个 epoch 中学习率的乘法因子 + by_epoch=True, # 是否按 epoch 更新学习率 + begin=0, # 开始更新学习率的步骤 + end=5), # 停止更新学习率的步骤 + dict( + type='MultiStepLR', # 当 epoch 数达到里程碑时,减少学习率 + begin=0, # 开始更新学习率的步骤 + end=20, # 停止更新学习率的步骤 + 
by_epoch=True, # 是否按 epoch 更新学习率 + milestones=[10, 15], # 学习率衰减的步骤 + gamma=0.1) # 学习率衰减的乘法因子 + ] + + # 优化器 + optim_wrapper = dict( + type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 以启用混合精度训练 + optimizer=dict( + type='SGD', # 优化器的名称 + lr=0.2, # 学习率 + momentum=0.9, # 动量因子 + weight_decay=0.0001), # 权重衰减 + clip_grad=dict(max_norm=40, norm_type=2)) # 梯度剪裁的配置 + + # 运行时设置 + default_scope = 'mmaction' # 默认注册表范围,用于查找模块。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子 + timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器 + logger=dict( + type='LoggerHook', # 用于记录训练/验证/测试阶段的日志的日志记录器 + interval=20, # 打印日志的间隔 + ignore_last=False), # 忽略每个 epoch 中最后几次迭代的日志 + param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中的某些超参数的钩子 + checkpoint=dict( + type='CheckpointHook', # 定期保存权重的钩子 + interval=3, # 保存周期 + save_best='auto', # 在评估过程中测量最佳权重的指标 + max_keep_ckpts=3), # 保留的最大权重文件数量 + sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器 + sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区的钩子 + env_cfg = dict( + cudnn_benchmark=False, # 是否启用 cudnn 的基准测试 + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数 + dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口 + + log_processor = dict( + type='LogProcessor', # 用于格式化日志信息的日志处理器 + window_size=20, # 默认平滑间隔 + by_epoch=True) # 是否使用 epoch 类型格式化日志 + vis_backends = [ + dict(type='LocalVisBackend')] # 可视化后端的列表 + visualizer = dict( + type='ActionVisualizer', # 可视化器的名称 + vis_backends=vis_backends) + log_level = 'INFO' # 日志级别 + load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。 + resume = False # 是否从 `load_from` 中定义的权重恢复训练。如果 `load_from` 为 None,则会从 `work_dir` 
中恢复最新的权重。 + ``` + +### 动作定位的配置系统 + +我们将模块化设计引入了配置系统中,方便进行各种实验。 + +- BMN 的示例 + + 为了帮助用户对完整的配置结构和动作定位系统中的模块有一个基本的了解,我们对 BMN 的配置进行了简要注释,具体如下所示。有关每个模块中每个参数的更详细用法和替代方法,请参阅 [API 文档](https://mmaction2.readthedocs.io/en/latest/api.html)。 + + ```python + # 模型设置 + model = dict( + type='BMN', # 定位器的类名 + temporal_dim=100, # 每个视频选取的总帧数 + boundary_ratio=0.5, # 确定视频边界的比率 + num_samples=32, # 每个 proposal 的采样数量 + num_samples_per_bin=3, # 每个采样的 bin 的采样数量 + feat_dim=400, # 特征的维度 + soft_nms_alpha=0.4, # Soft NMS 的 alpha 值 + soft_nms_low_threshold=0.5, # Soft NMS 的低阈值 + soft_nms_high_threshold=0.9, # Soft NMS 的高阈值 + post_process_top_k=100) # 后处理中的 top-k proposal 数量 + + # 数据集设置 + dataset_type = 'ActivityNetDataset' # 用于训练、验证和测试的数据集类型 + data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # 用于训练的数据的根目录 + data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # 用于验证和测试的数据的根目录 + ann_file_train = 'data/ActivityNet/anet_anno_train.json' # 用于训练的注释文件的路径 + ann_file_val = 'data/ActivityNet/anet_anno_val.json' # 用于验证的注释文件的路径 + ann_file_test = 'data/ActivityNet/anet_anno_test.json' # 用于测试的注释文件的路径 + + train_pipeline = [ + dict(type='LoadLocalizationFeature'), # 加载定位特征的管道 + dict(type='GenerateLocalizationLabels'), # 生成定位标签的管道 + dict( + type='PackLocalizationInputs', # 打包定位数据 + keys=('gt_bbox'), # 输入的键 + meta_keys=('video_name'))] # 输入的元键 + val_pipeline = [ + dict(type='LoadLocalizationFeature'), # 加载定位特征的管道 + dict(type='GenerateLocalizationLabels'), # 生成定位标签的管道 + dict( + type='PackLocalizationInputs', # 打包定位数据 + keys=('gt_bbox'), # 输入的键 + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # 输入的元键 + test_pipeline = [ + dict(type='LoadLocalizationFeature'), # 加载定位特征的管道 + dict( + type='PackLocalizationInputs', # 打包定位数据 + keys=('gt_bbox'), # 输入的键 + meta_keys=('video_name', 'duration_second', 'duration_frame', + 'annotations', 'feature_frame'))] # 输入的元键 + train_dataloader = dict( + batch_size=8, # 每个单 GPU 训练的批处理大小 + num_workers=8, # 每个单 GPU 
训练时预取数据的 worker 数量 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程,这可以加快训练速度 + sampler=dict( + type='DefaultSampler', # 默认采样器,支持分布式和非分布式训练。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # 在每个 epoch 中随机打乱训练数据 + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, # 注释文件的路径 + data_prefix=dict(video=data_root), # 视频路径的前缀 + pipeline=train_pipeline) + ) + val_dataloader = dict( + batch_size=1, # 每个单 GPU 评估的批处理大小 + num_workers=8, # 每个单 GPU 评估时预取数据的 worker 数量 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 在验证和测试时不打乱数据 + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # 注释文件的路径 + data_prefix=dict(video=data_root_val), # 视频路径的前缀 + pipeline=val_pipeline, + test_mode=True) + ) + test_dataloader = dict( + batch_size=1, # 每个单 GPU 测试的批处理大小 + num_workers=8, # 每个单 GPU 测试时预取数据的 worker 数量 + persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 在验证和测试时不打乱数据 + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, # 注释文件的路径 + data_prefix=dict(video=data_root_val), # 视频路径的前缀 + pipeline=test_pipeline, + test_mode=True) + ) + + # 评估设置 + work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # 保存当前实验的模型权重和日志的目录 + val_evaluator = dict( + type='ANetMetric', + metric_type='AR@AN', + dump_config=dict( + out=f'{work_dir}/results.json', # 输出文件的路径 + output_format='json')) # 输出文件的格式 + test_evaluator = val_evaluator # 将 test_evaluator 设置为 val_evaluator + + max_epochs = 9 # 训练模型的总 epoch 数量 + train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的名称 + max_epochs=max_epochs, # 总的训练 epoch 数量 + val_begin=1, # 开始验证的 epoch + val_interval=1) # 验证的间隔 + val_cfg = dict( + type='ValLoop') # 验证循环的名称 + test_cfg = dict( + type='TestLoop') # 测试循环的名称 + + # 学习策略 + param_scheduler = [ + dict( + type='MultiStepLR', # 当 epoch 数达到里程碑时,减少学习率 + 
begin=0, # 开始更新学习率的步骤 + end=max_epochs, # 停止更新学习率的步骤 + by_epoch=True, # 是否按 epoch 更新学习率 + milestones=[7, ], # 学习率衰减的步骤 + gamma=0.1) # 学习率衰减的乘法因子 + ] + + # 优化器 + optim_wrapper = dict( + type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 以启用混合精度训练 + optimizer=dict( + type='Adam', # 优化器的名称 + lr=0.001, # 学习率 + weight_decay=0.0001), # 权重衰减 + clip_grad=dict(max_norm=40, norm_type=2)) # 梯度剪裁的配置 + + # 运行时设置 + default_scope = 'mmaction' # 默认注册表范围,用于查找模块。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子 + timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器 + logger=dict( + type='LoggerHook', # 用于记录训练/验证/测试阶段的日志的日志记录器 + interval=20, # 打印日志的间隔 + ignore_last=False), # 忽略每个 epoch 中最后几次迭代的日志 + param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中的某些超参数的钩子 + checkpoint=dict( + type='CheckpointHook', # 定期保存权重的钩子 + interval=3, # 保存周期 + save_best='auto', # 在评估过程中测量最佳权重的指标 + max_keep_ckpts=3), # 保留的最大权重文件数量 + sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器 + sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区的钩子 + env_cfg = dict( + cudnn_benchmark=False, # 是否启用 cudnn 的基准测试 + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数 + dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口 + + log_processor = dict( + type='LogProcessor', # 用于格式化日志信息的日志处理器 + window_size=20, # 默认平滑间隔 + by_epoch=True) # 是否使用 epoch 类型格式化日志 + vis_backends = [ + dict(type='LocalVisBackend')] # 可视化后端的列表 + visualizer = dict( + type='ActionVisualizer', # 可视化器的名称 + vis_backends=vis_backends) + log_level = 'INFO' # 日志级别 + load_from = None # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。 + resume = False # 是否从 `load_from` 中定义的权重恢复训练。如果 `load_from` 为 None,则会从 `work_dir` 中恢复最新的权重。 + ``` diff --git a/docs/zh_cn/user_guides/finetune.md b/docs/zh_cn/user_guides/finetune.md new file mode 100644 index 0000000000000000000000000000000000000000..9c5e674e5afab07c532dfed5b08bb04dec05e7fe 
--- /dev/null +++ b/docs/zh_cn/user_guides/finetune.md @@ -0,0 +1,320 @@ +# 模型微调 + +本教程提供了使用预训练模型在其他数据集上进行微调的指导。通过微调,可以获得更好的性能。 + +- [模型微调](#模型微调) + - [概述](#概述) + - [选择模板配置](#选择模板配置) + - [修改 Head](#修改-head) + - [修改数据集](#修改数据集) + - [修改训练计划](#修改训练计划) + - [使用预训练模型](#使用预训练模型) + - [开始训练](#开始训练) + +## 概述 + +在新数据集上进行模型微调有两个步骤。 + +1. 添加对新数据集的支持。请参考[准备数据集](prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)。 +2. 修改配置文件。本教程将讨论这一部分。 + +## 选择模板配置 + +这里我们以 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 为例。我们首先将该配置文件复制到同一文件夹,并将其重命名为 `tsn_ucf101.py`,然后需要注意配置中的四个部分,具体来说,为不存在的键添加新键,并修改现有键的原始键。 + +## 修改 Head + +`cls_head` 中的 `num_classes` 需要更改为新数据集的类别数。预训练模型的权重会被重用,除了最后的预测层。因此,更改类别数是安全的。在我们的例子中,UCF101 有 101 个类别。所以我们将其从 400(Kinetics-400 的类别数)改为 101。 + +```python +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # 将 400 修改为 101 + )) +``` + +## 修改数据集 + +MMAction2 支持 UCF101、Kinetics-400、Moments in Time、Multi-Moments in Time、THUMOS14、Something-Something V1&V2、ActivityNet 数据集。用户可能需要将上述其中一个数据集适应到他们的特殊数据集上。你可以参考[准备数据集](prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)了解更多细节。在我们的例子中,UCF101 已经由各种数据集类型支持,例如 `VideoDataset`,因此我们将配置修改如下。 + +```python +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' +``` + +## 修改训练计划 + +微调通常需要较小的学习率和较少的训练周期。 + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # 将 100 修改为 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # 将 100 修改为 50 + by_epoch=True, + milestones=[20, 40], # 修改 milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # 将 0.01 
修改为 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) +``` + +## 使用预训练模型 + +为了在整个网络上使用预训练模型,新配置文件在 `load_from` 中添加了预训练模型的链接。我们在 `configs/_base_/default_runtime.py` 中设置 `load_from=None` 作为默认值,并且根据[继承设计](config.md),用户可以通过在其配置中设置 `load_from` 来直接更改它。 + +```python +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' # 模型路径可以在模型库中找到 +``` + +## 开始训练 + +现在,我们已经完成了微调的配置文件,如下所示: + +```python +_base_ = [ + '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101 # 将 400 修改为 101 + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', 
input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # 将 100 修改为 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # 将 100 修改为 50 + by_epoch=True, + milestones=[20, 40], # 修改 milestones + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # 将 0.01 修改为 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +# Default setting for scaling LR 
automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (32 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=256) + +# use the pre-trained model for the whole TSN network +load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +另一种更简单的方法是继承 kinetics400 配置,并只指定修改的键。请确保自定义配置与 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 在同一个文件夹中。 + +```python +_base_ = [ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # 继承模板配置 +] + +# model settings +model = dict( + cls_head=dict( + type='TSNHead', + num_classes=101)) # 将 400 修改为 101 + + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/ucf101/videos_train/' +data_root_val = 'data/ucf101/videos_val/' +ann_file_train = 'data/ucf101/ucf101_train_list.txt' +ann_file_val = 'data/ucf101/ucf101_val_list.txt' + +train_dataloader = dict( + dataset=dict( + ann_file=ann_file_train, + data_prefix=dict(video=data_root))) +val_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) +test_dataloader = dict( + dataset=dict( + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val))) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=50, # 将 100 修改为 50 + val_begin=1, + val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=50, # 将 100 修改为 50 + by_epoch=True, + milestones=[20, 40], # 修改 milestones + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', + lr=0.005, # 将 0.01 修改为 0.005 + momentum=0.9, + weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# use the pre-trained model for the whole TSN network +load_from = 
'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' + +``` + +你可以使用以下命令在你的数据集上微调模型。 + +```shell +python tools/train.py ${CONFIG_FILE} [可选参数] +``` + +例如:在确定性选项下,在 UCF101 数据集上微调 TSN 模型。 + +```shell +python tools/train.py configs/recognition/tsn/tsn_ucf101.py \ + --seed=0 --deterministic +``` + +更多细节,请参考[训练和测试教程](train_test.md)中的**训练**部分。 diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..d6b61091d074885c583234b113eb13b87f3d0360 --- /dev/null +++ b/docs/zh_cn/user_guides/inference.md @@ -0,0 +1,39 @@ +# 使用现有模型进行推理 + +MMAction2 在[模型库](../modelzoo.md)中提供了预训练的视频理解模型。本文将展示如何使用现有模型对给定的视频进行推理。 + +关于如何在标准数据集上测试现有模型,请参考这个[指南](./train_test.md#test)。 + +## 对给定视频进行推理 + +MMAction2 提供了用于对给定视频进行推理的高级 Python API: + +- [init_recognizer](mmaction.apis.init_recognizer): 使用配置文件和权重文件初始化一个识别器 +- [inference_recognizer](mmaction.apis.inference_recognizer): 对给定视频进行推理 + +下面是一个使用 Kinetics-400 预训练权重构建模型并对给定视频进行推理的示例。 + +```{note} +如果您将 mmaction2 用作第三方包,您需要下载示例中的配置文件和演示视频。 + +运行 'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .' 
下载所需的配置文件。 + +运行 'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4' 下载所需的演示视频。 +``` + +```python +from mmaction.apis import inference_recognizer, init_recognizer + +config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' +checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # 可以是本地路径 +img_path = 'demo/demo.mp4' # 您可以指定自己的图片路径 + +# 从配置文件和权重文件中构建模型 +model = init_recognizer(config_path, checkpoint_path, device="cpu") # device 可以是 'cuda:0' +# 对单个视频进行测试 +result = inference_recognizer(model, img_path) +``` + +`result` 是一个包含 `pred_scores` 的字典。 + +示例中的动作识别演示可以在[demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py)中找到。 diff --git a/docs/zh_cn/user_guides/prepare_dataset.md b/docs/zh_cn/user_guides/prepare_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..c8cd66fb432cdfa44fc05da7e532ac96e30fafdc --- /dev/null +++ b/docs/zh_cn/user_guides/prepare_dataset.md @@ -0,0 +1,295 @@ +# 准备数据集 + +MMAction2 支持许多现有的数据集。在本章中,我们将引导您准备 MMAction2 的数据集。 + +- [准备数据集](#准备数据集) + - [关于视频数据格式的说明](#关于视频数据格式的说明) + - [使用内置数据集](#使用内置数据集) + - [使用自定义数据集](#使用自定义数据集) + - [动作识别](#动作识别) + - [基于骨骼的动作识别](#基于骨骼的动作识别) + - [基于音频的动作识别](#基于音频的动作识别) + - [时空动作检测](#时空动作检测) + - [时序动作定位](#时序动作定位) + - [使用混合数据集进行训练](#使用混合数据集进行训练) + - [重复数据集](#重复数据集) + - [浏览数据集](#浏览数据集) + +## 关于视频数据格式的说明 + +MMAction2 支持两种类型的数据格式:原始帧和视频。前者在之前的项目(如 [TSN](https://github.com/yjxiong/temporal-segment-networks))中被广泛使用。当 SSD 可用时,这种方法运行速度很快,但无法满足日益增长的数据集需求(例如,最新的 [Kinetics](https://www.deepmind.com/open-source/kinetics) 数据集有 65 万个视频,总帧数将占用几 TB 的空间)。后者可以节省空间,但必须在执行时进行计算密集型的视频解码。为了加快视频解码速度,我们支持几种高效的视频加载库,如 [decord](https://github.com/zhreshold/decord)、[PyAV](https://github.com/PyAV-Org/PyAV) 等。 + +## 使用内置数据集 + +MMAction2 已经支持许多数据集,我们在路径 
`$MMACTION2/tools/data/` 下提供了用于数据准备的 shell 脚本,请参考[支持的数据集](https://mmaction2.readthedocs.io/zh_CN/latest/datasetzoo_statistics.html)以获取准备特定数据集的详细信息。 + +## 使用自定义数据集 + +最简单的方法是将您的数据集转换为现有的数据集格式: + +- `RawFrameDataset` 和 `VideoDataset` 用于[动作识别](#动作识别) +- `PoseDataset` 用于[基于骨骼的动作识别](#基于骨骼的动作识别) +- `AudioDataset` 用于[基于音频的动作识别](#基于音频的动作识别) +- `AVADataset` 用于[时空动作检测](#时空动作检测) +- `ActivityNetDataset` 用于[时序动作定位](#时序动作定位) + +在数据预处理之后,用户需要进一步修改配置文件以使用数据集。以下是在原始帧格式中使用自定义数据集的示例。 + +在 `configs/task/method/my_custom_config.py` 中: + +```python +... +# 数据集设置 +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... 
### 基于骨骼的动作识别
`rb_x`:右下角点的规范化 x 坐标
`rb_y`:右下角点的规范化 y 坐标
bbox_info(np.ndarray,形状为`[n, 5]`):检测到的边界框,每行格式为 \<x1\> \<y1\> \<x2\> \<y2\> \<score\>。x1、y1、x2、y2 是相对于帧大小归一化的值,范围为 0.0-1.0。
diff --git a/docs/zh_cn/user_guides/train_test.md b/docs/zh_cn/user_guides/train_test.md new file mode 100644 index 0000000000000000000000000000000000000000..1da2e3cf3b1c8c1f84e366f825fd09721c5e9294 --- /dev/null +++ b/docs/zh_cn/user_guides/train_test.md @@ -0,0 +1,248 @@ +# 训练与测试 + +- [训练与测试](#训练与测试) + - [训练](#训练) + - [使用单个 GPU 进行训练](#使用单个-gpu-进行训练) + - [使用多个 GPU 进行训练](#使用多个-gpu-进行训练) + - [使用多台机器进行训练](#使用多台机器进行训练) + - [同一网络中的多台机器](#同一网络中的多台机器) + - [使用 slurm 管理的多台机器](#使用-slurm-管理的多台机器) + - [测试](#测试) + - [使用单个 GPU 进行测试](#使用单个-gpu-进行测试) + - [使用多个 GPU 进行测试](#使用多个-gpu-进行测试) + - [使用多台机器进行测试](#使用多台机器进行测试) + - [同一网络中的多台机器](#同一网络中的多台机器-1) + - [使用 slurm 管理的多台机器](#使用-slurm-管理的多台机器-1) + +## 训练 + +### 使用单个 GPU 进行训练 + +您可以使用 `tools/train.py` 在一台带有 CPU 和 GPU(可选) 的单机上训练模型。 + +下面是脚本的完整用法: + +```shell +python tools/train.py ${CONFIG_FILE} [ARGS] +``` + +````{note} +默认情况下,MMAction2 更倾向于使用 GPU 而不是 CPU 进行训练。如果您想在 CPU 上训练模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1 以使 GPU 对程序不可见。 + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] +``` +```` + +| 参数 | 描述 | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | 配置文件的路径。 | +| `--work-dir WORK_DIR` | 保存日志和权重的目标文件夹。默认为与配置文件相同名称的文件夹,位于 `./work_dirs` 下。 | +| `--resume [RESUME]` | 恢复训练。如果指定了路径,则从该路径恢复,如果未指定,则尝试从最新的权重自动恢复。 | +| `--amp` | 启用自动混合精度训练。 | +| `--no-validate` | **不建议使用**。在训练期间禁用权重评估。 | +| `--auto-scale-lr` | 根据实际批次大小和原始批次大小自动缩放学习率。 | +| `--seed` | 随机种子。 | +| `--diff-rank-seed` | 是否为不同的 rank 设置不同的种子。 | +| `--deterministic` | 是否为 CUDNN 后端设置确定性选项。 | +| `--cfg-options CFG_OPTIONS` | 覆盖使用的配置中的某些设置,xxx=yyy 格式的键值对将合并到配置文件中。如果要覆盖的值是一个列表,则应采用 `key="[a,b]"` 或 `key=a,b` 的形式。该参数还允许嵌套的列表/元组值,例如 `key="[(a,b),(c,d)]"`。请注意,引号是必需的,且不允许有空格。 | +| `--launcher {none,pytorch,slurm,mpi}` | 作业启动器的选项。默认为 `none`。 | + +### 使用多个 GPU 进行训练 + +我们提供了一个 shell 脚本使用 
`torch.distributed.launch` 来启动多个 GPU 的训练任务。 + +```shell +bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +| 参数 | 描述 | +| ---------- | ----------------------------------------------------------------------- | +| `CONFIG` | 配置文件的路径。 | +| `GPUS` | 要使用的 GPU 数量。 | +| `[PYARGS]` | `tools/train.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行训练)。 | + +您还可以通过环境变量来指定启动器的其他参数。例如,使用以下命令将启动器的通信端口更改为 29666: + +```shell +PORT=29666 bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS] +``` + +如果您想启动多个训练作业并使用不同的 GPU,可以通过指定不同的端口和可见设备来启动它们。 + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS] +``` + +### 使用多台机器进行训练 + +#### 同一网络中的多台机器 + +如果您使用以太网连接的多台机器启动训练作业,可以运行以下命令: + +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +在第二台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +需要指定以下额外的环境变量来训练或测试多台机器上的模型: + +| ENV_VARS | 描述 | +| ------------- | ---------------------------------------------------------------- | +| `NNODES` | 机器的总数。默认为 1。 | +| `NODE_RANK` | 本地机器的索引。默认为 0。 | +| `PORT` | 通信端口,在所有机器上应该保持一致。默认为 29500。 | +| `MASTER_ADDR` | 主机器的 IP 地址,在所有机器上应该保持一致。默认为 `127.0.0.1`。 | + +通常,如果您没有高速网络(如 InfiniBand),则速度会比较慢。 + +#### 使用 slurm 管理的多台机器 + +如果您在使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMAction2,可以使用脚本 `slurm_train.sh`。 + +```shell +[ENV_VARS] bash tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG} [PY_ARGS] +``` + +下面是该脚本的参数描述。 + +| 参数 | 描述 | +| ----------- | ----------------------------------------------------------------------- | +| `PARTITION` | 集群中要使用的分区。 | +| `JOB_NAME` | 作业的名称,您可以自定义。 | +| `CONFIG` | 配置文件的路径。 | +| `[PYARGS]` | `tools/train.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行训练)。 | + +下面列出了可用于配置 slurm 作业的环境变量。 + +| ENV_VARS | 描述 | +| --------------- | 
-------------------------------------------------------------------------------- | +| `GPUS` | 要使用的 GPU 数量。默认为 8。 | +| `GPUS_PER_NODE` | 每个节点要分配的 GPU 数量。默认为 8。 | +| `CPUS_PER_TASK` | 每个任务要分配的 CPU 数量(通常一个 GPU 对应一个任务)。默认为 5。 | +| `SRUN_ARGS` | `srun` 的其他参数。可用选项可在[这里](https://slurm.schedmd.com/srun.html)找到。 | + +## 测试 + +### 使用单个 GPU 进行测试 + +您可以使用 `tools/test.py` 在一台带有 CPU 和可选 GPU 的单机上测试模型。 + +下面是脚本的完整用法: + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` + +````{note} +默认情况下,MMAction2 更倾向于使用 GPU 而不是 CPU 进行测试。如果您想在 CPU 上测试模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1 以使 GPU 对程序不可见。 + +```bash +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS] +``` +```` + +| 参数 | 描述 | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CONFIG_FILE` | 配置文件的路径。 | +| `CHECKPOINT_FILE` | 权重文件的路径(可以是 HTTP 链接)。 | +| `--work-dir WORK_DIR` | 保存包含评估指标的文件的目录。默认为与配置文件相同名称的文件夹,位于 `./work_dirs` 下。 | +| `--dump DUMP` | 存储模型的所有输出以进行离线评估的路径。 | +| `--cfg-options CFG_OPTIONS` | 覆盖使用的配置中的某些设置,xxx=yyy 格式的键值对将合并到配置文件中。如果要覆盖的值是一个列表,则应采用 `key="[a,b]"` 或 `key=a,b` 的形式。该参数还允许嵌套的列表/元组值,例如 `key="[(a,b),(c,d)]"`。请注意,引号是必需的,且不允许有空格。 | +| `--show-dir SHOW_DIR` | 保存结果可视化图片的目录。 | +| `--show` | 在窗口中可视化预测结果。 | +| `--interval INTERVAL` | 可视化的样本间隔。默认为 1。 | +| `--wait-time WAIT_TIME` | 每个窗口的显示时间(单位:秒)。默认为 2。 | +| `--launcher {none,pytorch,slurm,mpi}` | 作业启动器的选项。默认为 `none`。 | + +### 使用多个 GPU 进行测试 + +我们提供了一个 shell 脚本使用 `torch.distributed.launch` 来启动多个 GPU 的测试任务。 + +```shell +bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +| 参数 | 描述 | +| ------------ | ---------------------------------------------------------------------- | +| `CONFIG` | 配置文件的路径。 | +| `CHECKPOINT` | 权重文件的路径(可以是 HTTP 链接)。 | +| `GPUS` | 要使用的 GPU 数量。 | +| `[PYARGS]` | `tools/test.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行测试)。 | 
+ +您还可以通过环境变量来指定启动器的其他参数。例如,使用以下命令将启动器的通信端口更改为 29666: + +```shell +PORT=29666 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS] +``` + +如果您想启动多个测试作业并使用不同的 GPU,可以通过指定不同的端口和可见设备来启动它们。 + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS] +``` + +### 使用多台机器进行测试 + +#### 同一网络中的多台机器 + +如果您使用以太网连接的多台机器进行测试作业,可以运行以下命令: + +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +在第二台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS +``` + +与单台机器上的多个 GPU 相比,您需要指定一些额外的环境变量: + +| ENV_VARS | 描述 | +| ------------- | ---------------------------------------------------------------- | +| `NNODES` | 机器的总数。默认为 1。 | +| `NODE_RANK` | 本地机器的索引。默认为 0。 | +| `PORT` | 通信端口,在所有机器上应该保持一致。默认为 29500。 | +| `MASTER_ADDR` | 主机器的 IP 地址,在所有机器上应该保持一致。默认为 `127.0.0.1`。 | + +通常,如果您没有高速网络(如 InfiniBand),则速度会比较慢。 + +#### 使用 slurm 管理的多台机器 + +如果您在使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMAction2,可以使用脚本 `slurm_test.sh`。 + +```shell +[ENV_VARS] bash tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG} ${CHECKPOINT} [PY_ARGS] +``` + +下面是该脚本的参数描述。 + +| 参数 | 描述 | +| ------------ | ---------------------------------------------------------------------- | +| `PARTITION` | 集群中要使用的分区。 | +| `JOB_NAME` | 作业的名称,您可以自定义。 | +| `CONFIG` | 配置文件的路径。 | +| `CHECKPOINT` | 权重文件的路径(可以是 HTTP 链接)。 | +| `[PYARGS]` | `tools/test.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行测试)。 | + +下面列出了可用于配置 slurm 作业的环境变量。 + +| ENV_VARS | 描述 | +| --------------- | -------------------------------------------------------------------------------- | +| `GPUS` | 要使用的 GPU 数量。默认为 8。 | +| `GPUS_PER_NODE` | 每个节点要分配的 GPU 数量。默认为 8。 | +| `CPUS_PER_TASK` | 每个任务要分配的 CPU 数量(通常一个 GPU 对应一个任务)。默认为 5。 | +| `SRUN_ARGS` | `srun` 
的其他参数。可用选项可在[这里](https://slurm.schedmd.com/srun.html)找到。 | diff --git a/docs/zh_cn/utils.py b/docs/zh_cn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..444e4c147d19d3f20686c81233d7ffc6e0821c19 --- /dev/null +++ b/docs/zh_cn/utils.py @@ -0,0 +1,28 @@ +import re +from pathlib import Path + + +def replace_link(pattern, template, content, file_path): + MMACT_ROOT = Path(__file__).absolute().parents[2] + GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/' + + def replace_core(matchobj): + name = matchobj.group(1) + link = matchobj.group(2) + if link.startswith('http') or link.startswith('#'): + return template.format(name, link) + # For link relative to project folder, such as '/configs/*/*.py' + elif Path(link).is_absolute(): + link = link.lstrip('/') + folder = MMACT_ROOT + # For link relative to current file, such as './config/*.py' + else: + folder = file_path.parent + file_link = link.split('#')[0] + assert (folder / file_link).exists(), \ + f'Link not found:\n{file_path}: {folder / link}' + rel_link = (folder / link).resolve().relative_to(MMACT_ROOT) + link = GITHUB_PREFIX + str(rel_link) + return template.format(name, link) + + return re.sub(pattern, replace_core, content) diff --git a/download_model.py b/download_model.py new file mode 100644 index 0000000000000000000000000000000000000000..47e6f4e15c3222a8654d919fe72eb471bcf206fe --- /dev/null +++ b/download_model.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Script to download the TSN model checkpoint for GenVidBench +""" + +import os +import requests +from tqdm import tqdm + +def download_file(url, filename): + """Download a file with progress bar""" + response = requests.get(url, stream=True) + total_size = int(response.headers.get('content-length', 0)) + + with open(filename, 'wb') as f, tqdm( + desc=filename, + total=total_size, + unit='iB', + unit_scale=True, + unit_divisor=1024, + ) as pbar: + for chunk in response.iter_content(chunk_size=8192): + size = 
f.write(chunk) + pbar.update(size) + +def main(): + """Download the TSN model checkpoint""" + # Create checkpoints directory + os.makedirs('checkpoints', exist_ok=True) + + # Model checkpoint URL (you may need to update this with the actual URL) + checkpoint_url = "https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth" + checkpoint_path = "checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth" + + if os.path.exists(checkpoint_path): + print(f"Checkpoint already exists: {checkpoint_path}") + return + + print(f"Downloading TSN model checkpoint...") + print(f"URL: {checkpoint_url}") + print(f"Destination: {checkpoint_path}") + + try: + download_file(checkpoint_url, checkpoint_path) + print(f"✅ Successfully downloaded checkpoint to {checkpoint_path}") + except Exception as e: + print(f"❌ Error downloading checkpoint: {e}") + print("Please download the checkpoint manually and place it in the checkpoints/ directory") + +if __name__ == "__main__": + main() diff --git a/fix_dependencies.py b/fix_dependencies.py new file mode 100644 index 0000000000000000000000000000000000000000..ecbebfa64a38023d7b425cf5c31622814128f300 --- /dev/null +++ b/fix_dependencies.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Script to fix dependency version conflicts +""" + +import subprocess +import sys + +def run_command(cmd): + """Run a command and return success status""" + try: + print(f"Running: {cmd}") + result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) + print(f"✅ Success: {cmd}") + return True + except subprocess.CalledProcessError as e: + print(f"❌ Error: {cmd}") + print(f"Error output: {e.stderr}") + return False + +def main(): + """Fix dependency issues""" + print("🔧 Fixing dependency version conflicts...") + + # Uninstall problematic packages + print("\n📦 Uninstalling conflicting packages...") + packages_to_remove = 
[ + "mmcv", + "mmdet", + "mmengine" + ] + + for package in packages_to_remove: + run_command(f"pip uninstall {package} -y") + + # Install compatible versions + print("\n📦 Installing compatible versions...") + compatible_packages = [ + "mmcv>=2.0.0,<2.2.0", + "mmengine>=0.7.1,<1.0.0", + "mmdet>=3.0.0,<4.0.0" + ] + + for package in compatible_packages: + if not run_command(f"pip install {package}"): + print(f"⚠️ Failed to install {package}") + + print("\n✅ Dependency fix completed!") + print("Now run: python test_app.py") + +if __name__ == "__main__": + main() diff --git a/hello.py b/hello.py new file mode 100644 index 0000000000000000000000000000000000000000..8092d538522bc88816611207a639d87b2c50be04 --- /dev/null +++ b/hello.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from genvidbench!") + + +if __name__ == "__main__": + main() diff --git a/mmaction/__init__.py b/mmaction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f701013f44991753496c110a3387d2f5120fc12 --- /dev/null +++ b/mmaction/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmengine +from mmengine.utils import digit_version + +from .version import __version__ + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.2.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.7.1' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +assert (digit_version(mmcv_minimum_version) <= mmcv_version + < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' + +assert (digit_version(mmengine_minimum_version) <= mmengine_version + < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' 
+ +__all__ = ['__version__'] diff --git a/mmaction/__pycache__/__init__.cpython-312.pyc b/mmaction/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f01dfb0d30f2a2b652b646841c9abb24a233846 Binary files /dev/null and b/mmaction/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/__pycache__/registry.cpython-312.pyc b/mmaction/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcb85d8ad5ae446fbe93c5dd0de3d06313cebbdc Binary files /dev/null and b/mmaction/__pycache__/registry.cpython-312.pyc differ diff --git a/mmaction/__pycache__/version.cpython-312.pyc b/mmaction/__pycache__/version.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e69e5a7e4f570ff40bba3e3ae5c4352c0f46c562 Binary files /dev/null and b/mmaction/__pycache__/version.cpython-312.pyc differ diff --git a/mmaction/apis/__init__.py b/mmaction/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7ab8a6c762275b531f179bb6a61adc0d02fac839 --- /dev/null +++ b/mmaction/apis/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .inference import (detection_inference, inference_recognizer, + inference_skeleton, init_recognizer, pose_inference) +from .inferencers import * # NOQA + +__all__ = [ + 'init_recognizer', 'inference_recognizer', 'inference_skeleton', + 'detection_inference', 'pose_inference' +] diff --git a/mmaction/apis/__pycache__/__init__.cpython-312.pyc b/mmaction/apis/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..078aa52d9f06f2faa441c258ed61d524290238bd Binary files /dev/null and b/mmaction/apis/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/apis/__pycache__/inference.cpython-312.pyc b/mmaction/apis/__pycache__/inference.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae3489a9751edce1f6384bf384e9980e37e15a44 Binary files /dev/null and b/mmaction/apis/__pycache__/inference.cpython-312.pyc differ diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..692173f5aebf0233f8539edadfe6c20340b78baa --- /dev/null +++ b/mmaction/apis/inference.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import mmengine +import numpy as np +import torch +import torch.nn as nn +from mmengine.dataset import Compose, pseudo_collate +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from mmengine.utils import track_iter_progress + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def init_recognizer(config: Union[str, Path, mmengine.Config], + checkpoint: Optional[str] = None, + device: Union[str, torch.device] = 'cuda:0') -> nn.Module: + """Initialize a recognizer from config file. 
+ + Args: + config (str or :obj:`Path` or :obj:`mmengine.Config`): Config file + path, :obj:`Path` or the config object. + checkpoint (str, optional): Checkpoint path/url. If set to None, + the model will not load any weights. Defaults to None. + device (str | torch.device): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + + Returns: + nn.Module: The constructed recognizer. + """ + if isinstance(config, (str, Path)): + config = mmengine.Config.fromfile(config) + elif not isinstance(config, mmengine.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + + init_default_scope(config.get('default_scope', 'mmaction')) + + if hasattr(config.model, 'backbone') and config.model.backbone.get( + 'pretrained', None): + config.model.backbone.pretrained = None + model = MODELS.build(config.model) + + if checkpoint is not None: + load_checkpoint(model, checkpoint, map_location='cpu') + model.cfg = config + model.to(device) + model.eval() + return model + + +def inference_recognizer(model: nn.Module, + video: Union[str, dict], + test_pipeline: Optional[Compose] = None + ) -> ActionDataSample: + """Inference a video with the recognizer. + + Args: + model (nn.Module): The loaded recognizer. + video (Union[str, dict]): The video file path or the results + dictionary (the input of pipeline). + test_pipeline (:obj:`Compose`, optional): The test pipeline. + If not specified, the test pipeline in the config will be + used. Defaults to None. + + Returns: + :obj:`ActionDataSample`: The inference results. Specifically, the + predicted scores are saved at ``result.pred_score``. 
+ """ + + if test_pipeline is None: + cfg = model.cfg + init_default_scope(cfg.get('default_scope', 'mmaction')) + test_pipeline_cfg = cfg.test_pipeline + test_pipeline = Compose(test_pipeline_cfg) + + input_flag = None + if isinstance(video, dict): + input_flag = 'dict' + elif isinstance(video, str) and osp.exists(video): + if video.endswith('.npy'): + input_flag = 'audio' + else: + input_flag = 'video' + else: + raise RuntimeError(f'The type of argument `video` is not supported: ' + f'{type(video)}') + + if input_flag == 'dict': + data = video + if input_flag == 'video': + data = dict(filename=video, label=-1, start_index=0, modality='RGB') + if input_flag == 'audio': + data = dict( + audio_path=video, + total_frames=len(np.load(video)), + start_index=0, + label=-1) + + data = test_pipeline(data) + data = pseudo_collate([data]) + + # Forward the model + with torch.no_grad(): + result = model.test_step(data)[0] + + return result + + +def inference_skeleton(model: nn.Module, + pose_results: List[dict], + img_shape: Tuple[int], + test_pipeline: Optional[Compose] = None + ) -> ActionDataSample: + """Inference a pose results with the skeleton recognizer. + + Args: + model (nn.Module): The loaded recognizer. + pose_results (List[dict]): The pose estimation results dictionary + (the results of `pose_inference`) + img_shape (Tuple[int]): The original image shape used for inference + skeleton recognizer. + test_pipeline (:obj:`Compose`, optional): The test pipeline. + If not specified, the test pipeline in the config will be + used. Defaults to None. + + Returns: + :obj:`ActionDataSample`: The inference results. Specifically, the + predicted scores are saved at ``result.pred_score``. 
+ """ + if test_pipeline is None: + cfg = model.cfg + init_default_scope(cfg.get('default_scope', 'mmaction')) + test_pipeline_cfg = cfg.test_pipeline + test_pipeline = Compose(test_pipeline_cfg) + + h, w = img_shape + num_keypoint = pose_results[0]['keypoints'].shape[1] + num_frame = len(pose_results) + num_person = max([len(x['keypoints']) for x in pose_results]) + fake_anno = dict( + frame_dict='', + label=-1, + img_shape=(h, w), + origin_shape=(h, w), + start_index=0, + modality='Pose', + total_frames=num_frame) + + keypoint = np.zeros((num_frame, num_person, num_keypoint, 2), + dtype=np.float16) + keypoint_score = np.zeros((num_frame, num_person, num_keypoint), + dtype=np.float16) + + for f_idx, frm_pose in enumerate(pose_results): + frm_num_persons = frm_pose['keypoints'].shape[0] + for p_idx in range(frm_num_persons): + keypoint[f_idx, p_idx] = frm_pose['keypoints'][p_idx] + keypoint_score[f_idx, p_idx] = frm_pose['keypoint_scores'][p_idx] + + fake_anno['keypoint'] = keypoint.transpose((1, 0, 2, 3)) + fake_anno['keypoint_score'] = keypoint_score.transpose((1, 0, 2)) + return inference_recognizer(model, fake_anno, test_pipeline) + + +def detection_inference(det_config: Union[str, Path, mmengine.Config, + nn.Module], + det_checkpoint: str, + frame_paths: List[str], + det_score_thr: float = 0.9, + det_cat_id: int = 0, + device: Union[str, torch.device] = 'cuda:0', + with_score: bool = False) -> tuple: + """Detect human boxes given frame paths. + + Args: + det_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`, + :obj:`torch.nn.Module`]): + Det config file path or Detection model object. It can be + a :obj:`Path`, a config object, or a module object. + det_checkpoint: Checkpoint path/url. + frame_paths (List[str]): The paths of frames to do detection inference. + det_score_thr (float): The threshold of human detection score. + Defaults to 0.9. + det_cat_id (int): The category id for human detection. Defaults to 0. 
+ device (Union[str, torch.device]): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + with_score (bool): Whether to append detection score after box. + Defaults to None. + + Returns: + List[np.ndarray]: List of detected human boxes. + List[:obj:`DetDataSample`]: List of data samples, generally used + to visualize data. + """ + try: + from mmdet.apis import inference_detector, init_detector + from mmdet.structures import DetDataSample + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_detector` and ' + '`init_detector` from `mmdet.apis`. These apis are ' + 'required in this inference api! ') + if isinstance(det_config, nn.Module): + model = det_config + else: + model = init_detector( + config=det_config, checkpoint=det_checkpoint, device=device) + + results = [] + data_samples = [] + print('Performing Human Detection for each frame') + for frame_path in track_iter_progress(frame_paths): + det_data_sample: DetDataSample = inference_detector(model, frame_path) + pred_instance = det_data_sample.pred_instances.cpu().numpy() + bboxes = pred_instance.bboxes + scores = pred_instance.scores + # We only keep human detection bboxs with score larger + # than `det_score_thr` and category id equal to `det_cat_id`. + valid_idx = np.logical_and(pred_instance.labels == det_cat_id, + pred_instance.scores > det_score_thr) + bboxes = bboxes[valid_idx] + scores = scores[valid_idx] + + if with_score: + bboxes = np.concatenate((bboxes, scores[:, None]), axis=-1) + results.append(bboxes) + data_samples.append(det_data_sample) + + return results, data_samples + + +def pose_inference(pose_config: Union[str, Path, mmengine.Config, nn.Module], + pose_checkpoint: str, + frame_paths: List[str], + det_results: List[np.ndarray], + device: Union[str, torch.device] = 'cuda:0') -> tuple: + """Perform Top-Down pose estimation. 
+ + Args: + pose_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`, + :obj:`torch.nn.Module`]): Pose config file path or + pose model object. It can be a :obj:`Path`, a config object, + or a module object. + pose_checkpoint: Checkpoint path/url. + frame_paths (List[str]): The paths of frames to do pose inference. + det_results (List[np.ndarray]): List of detected human boxes. + device (Union[str, torch.device]): The desired device of returned + tensor. Defaults to ``'cuda:0'``. + + Returns: + List[List[Dict[str, np.ndarray]]]: List of pose estimation results. + List[:obj:`PoseDataSample`]: List of data samples, generally used + to visualize data. + """ + try: + from mmpose.apis import inference_topdown, init_model + from mmpose.structures import PoseDataSample, merge_data_samples + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `inference_topdown` and ' + '`init_model` from `mmpose.apis`. These apis ' + 'are required in this inference api! ') + if isinstance(pose_config, nn.Module): + model = pose_config + else: + model = init_model(pose_config, pose_checkpoint, device) + + results = [] + data_samples = [] + print('Performing Human Pose Estimation for each frame') + for f, d in track_iter_progress(list(zip(frame_paths, det_results))): + pose_data_samples: List[PoseDataSample] \ + = inference_topdown(model, f, d[..., :4], bbox_format='xyxy') + pose_data_sample = merge_data_samples(pose_data_samples) + pose_data_sample.dataset_meta = model.dataset_meta + # make fake pred_instances + if not hasattr(pose_data_sample, 'pred_instances'): + num_keypoints = model.dataset_meta['num_keypoints'] + pred_instances_data = dict( + keypoints=np.empty(shape=(0, num_keypoints, 2)), + keypoints_scores=np.empty(shape=(0, 17), dtype=np.float32), + bboxes=np.empty(shape=(0, 4), dtype=np.float32), + bbox_scores=np.empty(shape=(0), dtype=np.float32)) + pose_data_sample.pred_instances = InstanceData( + **pred_instances_data) + + poses = 
pose_data_sample.pred_instances.to_dict() + results.append(poses) + data_samples.append(pose_data_sample) + + return results, data_samples diff --git a/mmaction/apis/inferencers/__init__.py b/mmaction/apis/inferencers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..237fbcb4e661c8c65a0a454f0285ccadec9a0532 --- /dev/null +++ b/mmaction/apis/inferencers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .actionrecog_inferencer import ActionRecogInferencer +from .mmaction2_inferencer import MMAction2Inferencer + +__all__ = ['ActionRecogInferencer', 'MMAction2Inferencer'] diff --git a/mmaction/apis/inferencers/__pycache__/__init__.cpython-312.pyc b/mmaction/apis/inferencers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..278c1bb1068149ce03ab7bcac04b9edd5caec3f0 Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-312.pyc b/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4cd40b3d4850fb0d5e34fbb2786d625e6c07cd2 Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/actionrecog_inferencer.cpython-312.pyc differ diff --git a/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-312.pyc b/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2169cfe1e1c3156e9179cf172bf8ff03edeec46f Binary files /dev/null and b/mmaction/apis/inferencers/__pycache__/mmaction2_inferencer.cpython-312.pyc differ diff --git a/mmaction/apis/inferencers/actionrecog_inferencer.py b/mmaction/apis/inferencers/actionrecog_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..1e7b32174746a046e524c2b972f2615d04686ca2 --- 
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, List, Optional, Sequence, Tuple, Union

import mmengine
import numpy as np
from mmengine.dataset import Compose
from mmengine.fileio import list_from_file
from mmengine.infer.infer import BaseInferencer, ModelType
from mmengine.registry import init_default_scope
from mmengine.structures import InstanceData

from mmaction.registry import INFERENCERS
from mmaction.structures import ActionDataSample
from mmaction.utils import ConfigType, get_str_type

InstanceList = List[InstanceData]
InputType = Union[str, np.ndarray]
InputsType = Union[InputType, Sequence[InputType]]
PredType = Union[InstanceData, InstanceList]
ImgType = Union[np.ndarray, Sequence[np.ndarray]]
ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]


@INFERENCERS.register_module(name='action-recognition')
@INFERENCERS.register_module()
class ActionRecogInferencer(BaseInferencer):
    """The inferencer for action recognition.

    Args:
        model (str, optional): Path to the config file or the model name
            defined in metafile. For example, it could be
            "slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb" or
            "configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py".
        weights (str, optional): Path to the checkpoint. If it is not specified
            and model is a model name of metafile, the weights will be loaded
            from metafile. Defaults to None.
        device (str, optional): Device to run inference. If None, the available
            device will be automatically used. Defaults to None.
        label_file (str, optional): label file for dataset.
        input_format (str): Input video format, Choices are 'video',
            'rawframes', 'array'. 'video' means input data is a video file,
            'rawframes' means input data is a video frame folder, and 'array'
            means input data is a np.ndarray. Defaults to 'video'.
        pack_cfg (dict, optional): Config for `InferencerPackInput` to load
            input. Defaults to None, treated as an empty dict.
        scope (str, optional): The scope of the model. Defaults to "mmaction".
    """

    preprocess_kwargs: set = set()
    forward_kwargs: set = set()
    visualize_kwargs: set = {
        'return_vis', 'show', 'wait_time', 'vid_out_dir', 'draw_pred', 'fps',
        'out_type', 'target_resolution'
    }
    postprocess_kwargs: set = {
        'print_result', 'pred_out_file', 'return_datasample'
    }

    def __init__(self,
                 model: Union[ModelType, str],
                 weights: Optional[str] = None,
                 device: Optional[str] = None,
                 label_file: Optional[str] = None,
                 input_format: str = 'video',
                 pack_cfg: Optional[dict] = None,
                 scope: Optional[str] = 'mmaction') -> None:
        # A global counter tracking the number of videos processed, for
        # naming of the output videos
        self.num_visualized_vids = 0
        self.input_format = input_format
        # Copy to avoid aliasing the caller's dict.  A ``None`` default is
        # used instead of a mutable ``{}`` default argument.
        self.pack_cfg = dict(pack_cfg) if pack_cfg else {}
        init_default_scope(scope)
        super().__init__(
            model=model, weights=weights, device=device, scope=scope)

        if label_file is not None:
            self.visualizer.dataset_meta = dict(
                classes=list_from_file(label_file))

    def __call__(self,
                 inputs: InputsType,
                 return_datasamples: bool = False,
                 batch_size: int = 1,
                 return_vis: bool = False,
                 show: bool = False,
                 wait_time: int = 0,
                 draw_pred: bool = True,
                 vid_out_dir: str = '',
                 out_type: str = 'video',
                 print_result: bool = False,
                 pred_out_file: str = '',
                 target_resolution: Optional[Tuple[int]] = None,
                 **kwargs) -> dict:
        """Call the inferencer.

        Args:
            inputs (InputsType): Inputs for the inferencer.
            return_datasamples (bool): Whether to return results as
                :obj:`BaseDataElement`. Defaults to False.
            batch_size (int): Inference batch size. Defaults to 1.
            return_vis (bool): Whether to return the visualization result.
                Defaults to False.
            show (bool): Whether to display the visualization results in a
                popup window. Defaults to False.
            wait_time (float): The interval of show (s). Defaults to 0.
            draw_pred (bool): Whether to draw predicted bounding boxes.
                Defaults to True.
            vid_out_dir (str): Output directory of visualization results.
                If left as empty, no file will be saved. Defaults to ''.
            out_type (str): Output type of visualization results.
                Defaults to 'video'.
            print_result (bool): Whether to print the inference result w/o
                visualization to the console. Defaults to False.
            pred_out_file: File to save the inference results w/o
                visualization. If left as empty, no file will be saved.
                Defaults to ''.
            target_resolution (Tuple[int], optional): Set to
                (desired_width, desired_height) to have resized frames. If
                either dimension is None, the frames are resized by keeping
                the existing aspect ratio. Defaults to None.

            **kwargs: Other keyword arguments passed to :meth:`preprocess`,
                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
                Each key in kwargs should be in the corresponding set of
                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
                and ``postprocess_kwargs``.

        Returns:
            dict: Inference and visualization results.
        """
        return super().__call__(
            inputs,
            return_datasamples,
            batch_size,
            return_vis=return_vis,
            show=show,
            wait_time=wait_time,
            draw_pred=draw_pred,
            vid_out_dir=vid_out_dir,
            print_result=print_result,
            pred_out_file=pred_out_file,
            out_type=out_type,
            target_resolution=target_resolution,
            **kwargs)

    def _inputs_to_list(self, inputs: InputsType) -> list:
        """Preprocess the inputs to a list. The main difference from mmengine
        version is that we don't list a directory cause input could be a frame
        folder.

        Preprocess inputs to a list according to its type:

        - list or tuple: return inputs
        - str: return a list containing the string. The string
          could be a path to file, a url or other types of string according
          to the task.

        Args:
            inputs (InputsType): Inputs for the inferencer.

        Returns:
            list: List of input for the :meth:`preprocess`.
        """
        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        return list(inputs)

    def _init_pipeline(self, cfg: ConfigType) -> Compose:
        """Initialize the test pipeline.

        The dataset's test pipeline is adapted to the declared
        ``input_format``: decode transforms are swapped for the matching
        decoder and the "Init" transform is inserted/removed as needed.
        """
        test_pipeline = cfg.test_dataloader.dataset.pipeline
        # Alter data pipelines for decode
        if self.input_format == 'array':
            for i in range(len(test_pipeline)):
                if 'Decode' in get_str_type(test_pipeline[i]['type']):
                    test_pipeline[i] = dict(type='ArrayDecode')
            test_pipeline = [
                x for x in test_pipeline if 'Init' not in x['type']
            ]
        elif self.input_format == 'video':
            if 'Init' not in get_str_type(test_pipeline[0]['type']):
                test_pipeline = [dict(type='DecordInit')] + test_pipeline
            else:
                test_pipeline[0] = dict(type='DecordInit')
            for i in range(len(test_pipeline)):
                if 'Decode' in get_str_type(test_pipeline[i]['type']):
                    test_pipeline[i] = dict(type='DecordDecode')
        elif self.input_format == 'rawframes':
            if 'Init' in get_str_type(test_pipeline[0]['type']):
                test_pipeline = test_pipeline[1:]
            for i in range(len(test_pipeline)):
                if 'Decode' in get_str_type(test_pipeline[i]['type']):
                    test_pipeline[i] = dict(type='RawFrameDecode')
        # Alter data pipelines to close TTA, avoid OOM
        # Use center crop instead of multiple crop
        for i in range(len(test_pipeline)):
            if get_str_type(
                    test_pipeline[i]['type']) in ['ThreeCrop', 'TenCrop']:
                test_pipeline[i]['type'] = 'CenterCrop'
        # Use single clip for `Recognizer3D`
        if cfg.model.type == 'Recognizer3D':
            for i in range(len(test_pipeline)):
                if get_str_type(test_pipeline[i]['type']) == 'SampleFrames':
                    test_pipeline[i]['num_clips'] = 1
        # Pack multiple types of input format
        test_pipeline.insert(
            0,
            dict(
                type='InferencerPackInput',
                input_format=self.input_format,
                **self.pack_cfg))

        return Compose(test_pipeline)

    def visualize(
        self,
        inputs: InputsType,
        preds: PredType,
        return_vis: bool = False,
        show: bool = False,
        wait_time: int = 0,
        draw_pred: bool = True,
        fps: int = 30,
        out_type: str = 'video',
        target_resolution: Optional[Tuple[int]] = None,
        vid_out_dir: str = '',
    ) -> Union[List[np.ndarray], None]:
        """Visualize predictions.

        Args:
            inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
            preds (List[Dict]): Predictions of the model.
            return_vis (bool): Whether to return the visualization result.
                Defaults to False.
            show (bool): Whether to display the image in a popup window.
                Defaults to False.
            wait_time (float): The interval of show (s). Defaults to 0.
            draw_pred (bool): Whether to draw prediction labels.
                Defaults to True.
            fps (int): Frames per second for saving video. Defaults to 30.
            out_type (str): Output format type, choose from 'img', 'gif',
                'video'. Defaults to ``'video'``.
            target_resolution (Tuple[int], optional): Set to
                (desired_width, desired_height) to have resized frames. If
                either dimension is None, the frames are resized by keeping
                the existing aspect ratio. Defaults to None.
            vid_out_dir (str): Output directory of visualization results.
                If left as empty, no file will be saved. Defaults to ''.

        Returns:
            List[np.ndarray] or None: Returns visualization results only if
            applicable.
        """
        # Nothing to do when there is no visualizer or no output was
        # requested (no popup, no file, no returned frames).
        if self.visualizer is None or (not show and vid_out_dir == ''
                                       and not return_vis):
            return None

        results = []

        for single_input, pred in zip(inputs, preds):
            if isinstance(single_input, str):
                # A path (video file or frame folder): visualizer reads it.
                frames = single_input
                video_name = osp.basename(single_input)
            elif isinstance(single_input, np.ndarray):
                frames = single_input.copy()
                # Name array inputs by a zero-padded running counter.
                video_num = str(self.num_visualized_vids).zfill(8)
                video_name = f'{video_num}.mp4'
            else:
                raise ValueError('Unsupported input type: '
                                 f'{type(single_input)}')

            out_path = osp.join(vid_out_dir, video_name) if vid_out_dir != '' \
                else None

            visualization = self.visualizer.add_datasample(
                video_name,
                frames,
                pred,
                show_frames=show,
                wait_time=wait_time,
                draw_gt=False,
                draw_pred=draw_pred,
                fps=fps,
                out_type=out_type,
                out_path=out_path,
                target_resolution=target_resolution,
            )
            results.append(visualization)
            self.num_visualized_vids += 1

        return results

    def postprocess(
        self,
        preds: PredType,
        visualization: Optional[List[np.ndarray]] = None,
        return_datasample: bool = False,
        print_result: bool = False,
        pred_out_file: str = '',
    ) -> Union[ResType, Tuple[ResType, np.ndarray]]:
        """Process the predictions and visualization results from ``forward``
        and ``visualize``.

        This method should be responsible for the following tasks:

        1. Convert datasamples into a json-serializable dict if needed.
        2. Pack the predictions and visualization results and return them.
        3. Dump or log the predictions.

        Args:
            preds (List[Dict]): Predictions of the model.
            visualization (Optional[np.ndarray]): Visualized predictions.
            return_datasample (bool): Whether to use Datasample to store
                inference results. If False, dict will be used.
            print_result (bool): Whether to print the inference result w/o
                visualization to the console. Defaults to False.
            pred_out_file: File to save the inference results w/o
                visualization. If left as empty, no file will be saved.
                Defaults to ''.

        Returns:
            dict: Inference and visualization results with key ``predictions``
            and ``visualization``.

            - ``visualization`` (Any): Returned by :meth:`visualize`.
            - ``predictions`` (dict or DataSample): Returned by
              :meth:`forward` and processed in :meth:`postprocess`.
              If ``return_datasample=False``, it usually should be a
              json-serializable dict containing only basic data elements such
              as strings and numbers.
        """
        result_dict = {}
        results = preds
        if not return_datasample:
            results = []
            for pred in preds:
                result = self.pred2dict(pred)
                results.append(result)
        # Add video to the results after printing and dumping
        result_dict['predictions'] = results
        if print_result:
            print(result_dict)
        if pred_out_file != '':
            mmengine.dump(result_dict, pred_out_file)
        result_dict['visualization'] = visualization
        return result_dict

    def pred2dict(self, data_sample: ActionDataSample) -> Dict:
        """Extract elements necessary to represent a prediction into a
        dictionary. It's better to contain only basic data elements such as
        strings and numbers in order to guarantee it's json-serializable.

        Args:
            data_sample (ActionDataSample): The data sample to be converted.

        Returns:
            dict: The output dictionary.
        """
        result = {}
        result['pred_labels'] = data_sample.pred_label.tolist()
        result['pred_scores'] = data_sample.pred_score.tolist()
        return result
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Sequence, Tuple, Union

import mmengine
import numpy as np
from mmengine.infer import BaseInferencer
from mmengine.structures import InstanceData

from mmaction.utils import ConfigType
from .actionrecog_inferencer import ActionRecogInferencer

InstanceList = List[InstanceData]
InputType = Union[str, np.ndarray]
InputsType = Union[InputType, Sequence[InputType]]
PredType = Union[InstanceData, InstanceList]
ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]


class MMAction2Inferencer(BaseInferencer):
    """MMAction2 Inferencer. It's a unified inferencer interface for video
    analyse task, currently including: ActionRecog. and it can be used to
    perform end-to-end action recognition inference.

    Args:
        rec (str, optional): Pretrained action recognition algorithm.
            It's the path to the config file or the model name defined in
            metafile. For example, it could be:

            - model alias, e.g. ``'slowfast'``,
            - config name, e.g. ``'slowfast_r50_8xb8-8x8x1-256e_kinetics400
              -rgb'``,
            - config path

            Defaults to ``None``.
        rec_weights (str, optional): Path to the custom checkpoint file of
            the selected rec model. If it is not specified and "rec" is a model
            name of metafile, the weights will be loaded from metafile.
            Defaults to None.
        device (str, optional): Device to run inference. For example,
            it could be 'cuda' or 'cpu'. If None, the available
            device will be automatically used. Defaults to None.
        label_file (str, optional): label file for dataset.
        input_format (str): Input video format, Choices are 'video',
            'rawframes', 'array'. 'video' means input data is a video file,
            'rawframes' means input data is a video frame folder, and 'array'
            means input data is a np.ndarray. Defaults to 'video'.
    """

    preprocess_kwargs: set = set()
    forward_kwargs: set = set()
    visualize_kwargs: set = {
        'return_vis', 'show', 'wait_time', 'vid_out_dir', 'draw_pred', 'fps',
        'out_type', 'target_resolution'
    }
    postprocess_kwargs: set = {
        'print_result', 'pred_out_file', 'return_datasample'
    }

    def __init__(self,
                 rec: Optional[str] = None,
                 rec_weights: Optional[str] = None,
                 device: Optional[str] = None,
                 label_file: Optional[str] = None,
                 input_format: str = 'video') -> None:
        # NOTE: ``BaseInferencer.__init__`` is intentionally not called;
        # this class delegates model handling to the wrapped
        # ``ActionRecogInferencer`` below.
        if rec is None:
            raise ValueError('rec algorithm should be provided.')

        self.visualizer = None
        self.num_visualized_imgs = 0

        # ``rec`` is guaranteed non-None here; the previous redundant
        # ``if rec is not None`` guard was removed.
        self.actionrecog_inferencer = ActionRecogInferencer(
            rec, rec_weights, device, label_file, input_format)
        self.mode = 'rec'

    def _init_pipeline(self, cfg: ConfigType) -> None:
        # Pipeline handling is delegated to the wrapped inferencer, so the
        # abstract hook is a no-op here.
        pass

    def forward(self, inputs: InputType, batch_size: int,
                **forward_kwargs) -> PredType:
        """Forward the inputs to the model.

        Args:
            inputs (InputsType): The inputs to be forwarded.
            batch_size (int): Batch size. Defaults to 1.

        Returns:
            Dict: The prediction results. Possibly with keys "rec".
        """
        result = {}
        if self.mode == 'rec':
            predictions = self.actionrecog_inferencer(
                inputs,
                return_datasamples=True,
                batch_size=batch_size,
                **forward_kwargs)['predictions']
            # Wrap each prediction in a list to keep a uniform
            # one-result-per-input nesting across tasks.
            result['rec'] = [[p] for p in predictions]

        return result

    def visualize(self, inputs: InputsType, preds: PredType,
                  **kwargs) -> List[np.ndarray]:
        """Visualize predictions.

        Args:
            inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
            preds (List[Dict]): Predictions of the model.
            show (bool): Whether to display the image in a popup window.
                Defaults to False.
            wait_time (float): The interval of show (s). Defaults to 0.
            draw_pred (bool): Whether to draw predicted bounding boxes.
                Defaults to True.
            fps (int): Frames per second for saving video. Defaults to 30.
            out_type (str): Output format type, choose from 'img', 'gif',
                'video'. Defaults to ``'video'``.
            target_resolution (Tuple[int], optional): Set to
                (desired_width, desired_height) to have resized frames. If
                either dimension is None, the frames are resized by keeping
                the existing aspect ratio. Defaults to None.
            vid_out_dir (str): Output directory of visualization results.
                If left as empty, no file will be saved. Defaults to ''.
        """

        if 'rec' in self.mode:
            return self.actionrecog_inferencer.visualize(
                inputs, preds['rec'][0], **kwargs)

    def __call__(
        self,
        inputs: InputsType,
        batch_size: int = 1,
        **kwargs,
    ) -> dict:
        """Call the inferencer.

        Args:
            inputs (InputsType): Inputs for the inferencer. It can be a path
                to image / image directory, or an array, or a list of these.
            batch_size (int): Batch size. Defaults to 1.
            **kwargs: Key words arguments passed to :meth:`preprocess`,
                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
                Each key in kwargs should be in the corresponding set of
                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
                and ``postprocess_kwargs``.

        Returns:
            dict: Inference and visualization results.
        """
        (
            preprocess_kwargs,
            forward_kwargs,
            visualize_kwargs,
            postprocess_kwargs,
        ) = self._dispatch_kwargs(**kwargs)

        ori_inputs = self._inputs_to_list(inputs)

        preds = self.forward(ori_inputs, batch_size, **forward_kwargs)

        visualization = self.visualize(
            ori_inputs, preds,
            **visualize_kwargs)  # type: ignore  # noqa: E501
        results = self.postprocess(preds, visualization, **postprocess_kwargs)
        return results

    def _inputs_to_list(self, inputs: InputsType) -> list:
        """Preprocess the inputs to a list. The main difference from mmengine
        version is that we don't list a directory cause input could be a frame
        folder.

        Preprocess inputs to a list according to its type:

        - list or tuple: return inputs
        - str: return a list containing the string. The string
          could be a path to file, a url or other types of string according
          to the task.

        Args:
            inputs (InputsType): Inputs for the inferencer.

        Returns:
            list: List of input for the :meth:`preprocess`.
        """
        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        return list(inputs)

    def postprocess(self,
                    preds: PredType,
                    visualization: Optional[List[np.ndarray]] = None,
                    print_result: bool = False,
                    pred_out_file: str = ''
                    ) -> Union[ResType, Tuple[ResType, np.ndarray]]:
        """Postprocess predictions.

        Args:
            preds (Dict): Predictions of the model.
            visualization (Optional[np.ndarray]): Visualized predictions.
            print_result (bool): Whether to print the result.
                Defaults to False.
            pred_out_file (str): Output file name to store predictions
                without images. Supported file formats are "json", "yaml/yml"
                and "pickle/pkl". Defaults to ''.

        Returns:
            Dict or List[Dict]: Each dict contains the inference result of
            each image. Possible keys are "rec_labels", "rec_scores"
        """

        result_dict = {}
        # One result dict per input; sized from any of the task result lists
        # (all tasks produce one entry per input).
        pred_results = [{} for _ in range(len(next(iter(preds.values()))))]
        if 'rec' in self.mode:
            for i, rec_pred in enumerate(preds['rec']):
                result = dict(rec_labels=[], rec_scores=[])
                for rec_pred_instance in rec_pred:
                    rec_dict_res = self.actionrecog_inferencer.pred2dict(
                        rec_pred_instance)
                    result['rec_labels'].append(rec_dict_res['pred_labels'])
                    result['rec_scores'].append(rec_dict_res['pred_scores'])
                pred_results[i].update(result)

        result_dict['predictions'] = pred_results
        if print_result:
            print(result_dict)
        if pred_out_file != '':
            mmengine.dump(result_dict, pred_out_file)
        result_dict['visualization'] = visualization
        return result_dict
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook, RuntimeInfoHook,
                            SyncBuffersHook)
from mmengine.runner import LogProcessor

from mmaction.visualization import ActionVisualizer, LocalVisBackend

# Default registry scope used to resolve all `type=` entries below.
default_scope = 'mmaction'

# Hooks executed by the runner at fixed points of the train/val/test loops.
default_hooks = dict(
    # Update runtime information (epoch, iter, ...) into the message hub.
    runtime_info=dict(type=RuntimeInfoHook),
    # Record per-iteration elapsed time.
    timer=dict(type=IterTimerHook),
    # Log every 20 iterations; do not special-case the last iteration.
    logger=dict(type=LoggerHook, interval=20, ignore_last=False),
    # Step the learning-rate / momentum schedulers.
    param_scheduler=dict(type=ParamSchedulerHook),
    # Save a checkpoint every epoch and track the best one automatically.
    checkpoint=dict(type=CheckpointHook, interval=1, save_best='auto'),
    # Re-seed the distributed sampler each epoch for proper shuffling.
    sampler_seed=dict(type=DistSamplerSeedHook),
    # Synchronize model buffers (e.g. BN running stats) across ranks.
    sync_buffers=dict(type=SyncBuffersHook))

# Environment settings: deterministic cudnn, fork-based multiprocessing with
# OpenCV threading disabled, and NCCL for distributed communication.
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))

# Smooth logged values over a 20-iteration window, reported per epoch.
log_processor = dict(type=LogProcessor, window_size=20, by_epoch=True)

# Visualization: write results to local storage only.
vis_backends = [dict(type=LocalVisBackend)]
visualizer = dict(type=ActionVisualizer, vis_backends=vis_backends)

log_level = 'INFO'
# Checkpoint to initialize from (None = train from scratch) and whether to
# resume a previous run.
load_from = None
resume = False
# Copyright (c) OpenMMLab. All rights reserved.
from mmaction.models import (ActionDataPreprocessor, Recognizer3D,
                             ResNet3dSlowFast, SlowFastHead)

# model settings
# SlowFast R50: a two-pathway 3D recognizer.  The slow pathway samples frames
# sparsely with many channels; the fast pathway samples densely with few
# channels and feeds lateral connections into the slow pathway.
model = dict(
    type=Recognizer3D,
    backbone=dict(
        type=ResNet3dSlowFast,
        pretrained=None,
        resample_rate=8,  # tau
        speed_ratio=8,  # alpha
        channel_ratio=8,  # beta_inv
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=True,  # receives fused features from the fast pathway
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1),
            norm_eval=False),
        fast_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=False,
            base_channels=8,  # 1/beta_inv of the slow pathway's width
            conv1_kernel=(5, 7, 7),
            conv1_stride_t=1,
            pool1_stride_t=1,
            norm_eval=False)),
    cls_head=dict(
        type=SlowFastHead,
        in_channels=2304,  # 2048+256
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5,
        average_clips='prob'),
    # Normalize with ImageNet mean/std; NCTHW layout for 3D convs.
    data_preprocessor=dict(
        type=ActionDataPreprocessor,
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'))
# Copyright (c) OpenMMLab. All rights reserved.
from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D,
                             ResNet3dSlowOnly)

# SlowOnly R50: the slow pathway of SlowFast used standalone, initialized
# from ImageNet-pretrained 2D ResNet-50 weights (inflated to 3D).
model = dict(
    type=Recognizer3D,
    backbone=dict(
        type=ResNet3dSlowOnly,
        depth=50,
        pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',
        lateral=False,  # no fast-pathway lateral connections
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        # Inflate only the last two residual stages to 3D.
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    cls_head=dict(
        type=I3DHead,
        in_channels=2048,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5,
        average_clips='prob'),
    # Normalize with ImageNet mean/std; NCTHW layout for 3D convs.
    data_preprocessor=dict(
        type=ActionDataPreprocessor,
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'))
# Copyright (c) OpenMMLab. All rights reserved.
from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D,
                             SwinTransformer3D)

# Video Swin Transformer (tiny) recognizer; the 3D backbone is initialized
# from 2D Swin weights (pretrained2d=True) when a checkpoint is given.
model = dict(
    type=Recognizer3D,
    backbone=dict(
        type=SwinTransformer3D,
        arch='tiny',
        pretrained=None,
        pretrained2d=True,  # inflate 2D Swin weights to 3D on load
        patch_size=(2, 4, 4),  # temporal x height x width patch embedding
        window_size=(8, 7, 7),  # 3D attention window
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1,  # stochastic depth
        patch_norm=True),
    # Normalize with ImageNet mean/std; NCTHW layout.
    data_preprocessor=dict(
        type=ActionDataPreprocessor,
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type=I3DHead,
        in_channels=768,
        num_classes=400,
        spatial_type='avg',
        dropout_ratio=0.5,
        average_clips='prob'))
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import read_base

# Pure-Python config: inherit the SlowFast R50 model and default runtime.
with read_base():
    from ..._base_.models.slowfast_r50 import *
    from ..._base_.default_runtime import *

from mmengine.dataset import DefaultSampler
from mmengine.optim import CosineAnnealingLR, LinearLR
from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
from torch.optim import SGD

from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
                               FormatShape, PackActionInputs,
                               RandomResizedCrop, Resize, SampleFrames,
                               ThreeCrop, VideoDataset)
from mmaction.evaluation import AccMetric

# Kinetics-400 video paths and annotation lists.
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'

file_client_args = dict(io_backend='disk')
# Training: 1 clip of 32 frames (interval 2), random resized crop + flip.
train_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 256)),
    dict(type=RandomResizedCrop),
    dict(type=Resize, scale=(224, 224), keep_ratio=False),
    dict(type=Flip, flip_ratio=0.5),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]
# Validation: deterministic single clip with center crop.
val_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(
        type=SampleFrames,
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        test_mode=True),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 256)),
    dict(type=CenterCrop, crop_size=224),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]
# Testing: 10-clip x 3-crop test-time augmentation.
test_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(
        type=SampleFrames,
        clip_len=32,
        frame_interval=2,
        num_clips=10,
        test_mode=True),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 256)),
    dict(type=ThreeCrop, crop_size=256),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]
train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=True),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,  # per-video inference due to heavy TTA
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type=AccMetric)
test_evaluator = val_evaluator

# 256 training epochs; validate every 5 epochs.
train_cfg = dict(
    type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

optim_wrapper = dict(
    optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=1e-4),
    clip_grad=dict(max_norm=40, norm_type=2))

# Iteration-wise linear warmup over the first 34 epochs, combined with a
# cosine decay spanning the full schedule (the two overlap by design in the
# upstream SlowFast recipe).
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=0.1,
        by_epoch=True,
        begin=0,
        end=34,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        T_max=256,
        eta_min=0,
        by_epoch=True,
        begin=0,
        end=256)
]

# Override runtime defaults: keep at most 3 checkpoints (every 4 epochs) and
# log less frequently for this long schedule.
default_hooks.update(
    dict(
        checkpoint=dict(interval=4, max_keep_ckpts=3),
        logger=dict(interval=100)))
diff --git a/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c73e8331f2ae22a00ec19bb704c2141fd35a60d5 --- /dev/null +++ b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.slowonly_r50 import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.evaluation import AccMetric + +# model settings +model.update(dict(backbone=dict(pretrained=None))) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=4, frame_interval=16, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=4, + frame_interval=16, + 
num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=4, + frame_interval=16, + num_clips=10, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=ThreeCrop, crop_size=256), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# learning policy +param_scheduler = [ + dict(type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=34), + dict( + type=CosineAnnealingLR, + T_max=222, + eta_min=0, + by_epoch=True, + begin=34, + end=256) +] + +optim_wrapper = dict( + optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=1e-4), + clip_grad=dict(max_norm=40, norm_type=2)) + +# 
runtime settings +default_hooks.update(dict(checkpoint=dict(interval=4, max_keep_ckpts=3))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/configs/recognition/swin/__init__.py b/mmaction/configs/recognition/swin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/swin/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..c450525a93e548a68377fc059626b2868fcf8b87 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='base', + drop_path_rate=0.3, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_base_patch4_window7_224.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1024))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, 
input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + 
eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..021ddcbc188958689e838de4f5388b9e4e4765cb --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_16xb8_amp_32x2x1_30e_kinetics700_rgb.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb import * # noqa: E501 + +model.update(dict(cls_head=dict(num_classes=700))) + +# dataset +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +dataset_type = VideoDataset +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + 
pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +optim_wrapper.update(dict(optimizer=dict(lr=2e-3))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=128)) diff --git a/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2909b782f9842a73bccda3fdfa691698b9b2fa --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_large_p244_w877_in22k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='large', + drop_path_rate=0.4, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_large_patch4_window7_224_22k.pth' # noqa: E501 + ), + cls_head=dict(in_channels=1536))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + 
dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + 
type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a27940cf4133c6e82f396749fc67645065847952 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_32xb4_amp_32x2x1_30e_kinetics710_rgb.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb import * # noqa: E501 + +from mmengine.dataset import DefaultSampler +from torch.utils.data import ConcatDataset + +model.update(dict(cls_head=dict(num_classes=710))) + +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type=VideoDataset, + 
ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type=VideoDataset, + ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type=VideoDataset, + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type=VideoDataset, + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type=VideoDataset, + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type=VideoDataset, + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type=ConcatDataset, + datasets=[k400_trainset, k600_trainset, k700_trainset], + _delete_=True) +k710_valset = dict( + type=ConcatDataset, + datasets=[k400_valset, k600_valset, k700_valset], + _delete_=True) +k710_testset = dict( + type=ConcatDataset, + datasets=[k400_testset, k600_testset, k700_testset], + _delete_=True, +) + +train_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_testset) + +optim_wrapper.update(dict(optimizer=dict(lr=2e-3))) 
+ +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (8 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=128)) diff --git a/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..769bc9b249f58510d6f969a133aea7b0969b1ba4 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_small_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + arch='small', + drop_path_rate=0.2, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_small_patch4_window7_224.pth' # noqa: E501 + ))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + 
+file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = 
dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..7afd26a65ac83d98fa18b125852fb55b588eca58 --- /dev/null +++ b/mmaction/configs/recognition/swin/swin_tiny_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.models.swin_tiny import * + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + RandomResizedCrop, Resize, SampleFrames, + ThreeCrop, VideoDataset) +from mmaction.engine import SwinOptimWrapperConstructor +from mmaction.evaluation import AccMetric + +model.update( + dict( + backbone=dict( + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_tiny_patch4_window7_224.pth' # noqa: E501 + ))) + +# dataset settings +dataset_type = VideoDataset +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] +test_pipeline = [ 
+ dict(type=DecordInit, **file_client_args), + dict( + type=SampleFrames, + clip_len=32, + frame_interval=2, + num_clips=4, + test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = val_evaluator + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02), + constructor=SwinOptimWrapperConstructor, + paramwise_cfg=dict( + absolute_pos_embed=dict(decay_mult=0.), + relative_position_bias_table=dict(decay_mult=0.), + norm=dict(decay_mult=0.), + backbone=dict(lr_mult=0.1))) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=2.5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=30, + eta_min=0, + by_epoch=True, + begin=0, + end=30) +] + +default_hooks.update( 
+ dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/configs/recognition/uniformerv2/__init__.py b/mmaction/configs/recognition/uniformerv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..1e6b335d2affd4b7abbbbe7aa69582b48e735832 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
# UniFormerV2-B/16 fine-tuned on Kinetics-400 with 8-frame uniform sampling.
# Both the backbone and the classification head are initialised from the same
# Kinetics-710 pre-trained checkpoint; the 710-way head weights are remapped
# to the 400 Kinetics classes via ``channel_map``.
from mmengine.config import read_base

with read_base():
    from ..._base_.default_runtime import *

from mmengine.dataset import DefaultSampler
from mmengine.optim import CosineAnnealingLR, LinearLR
from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
from torch.optim import AdamW

from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
                               FormatShape, PackActionInputs,
                               PytorchVideoWrapper, RandomResizedCrop, Resize,
                               ThreeCrop, UniformSample, VideoDataset)
from mmaction.evaluation import AccMetric
from mmaction.models import (ActionDataPreprocessor, Recognizer3D,
                             UniFormerHead, UniFormerV2)

# model settings
num_frames = 8  # clip length (frames per clip) fed to the backbone
model = dict(
    type=Recognizer3D,
    backbone=dict(
        type=UniFormerV2,
        input_resolution=224,
        patch_size=16,
        width=768,
        layers=12,
        heads=12,
        t_size=num_frames,
        dw_reduction=1.5,
        backbone_drop_path_rate=0.,
        temporal_downsample=False,
        no_lmhra=True,
        double_lmhra=True,
        return_list=[8, 9, 10, 11],
        n_layers=4,
        n_dim=768,
        n_head=12,
        mlp_factor=4.,
        drop_path_rate=0.,
        mlp_dropout=[0.5, 0.5, 0.5, 0.5],
        # CLIP weights are not loaded separately: the K710 checkpoint below
        # already contains the adapted backbone weights.
        clip_pretrained=False,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=  # noqa: E251
            'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth',  # noqa: E501
            prefix='backbone.')),
    cls_head=dict(
        type=UniFormerHead,
        dropout_ratio=0.5,
        num_classes=400,
        in_channels=768,
        average_clips='prob',
        # Maps the 710-way pre-trained logits onto the K400 label space.
        channel_map=  # noqa: E251
        'configs/recognition/uniformerv2/k710_channel_map/map_k400.json',
        init_cfg=dict(
            type='Pretrained',
            checkpoint=  # noqa: E251
            'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth',  # noqa: E501
            prefix='cls_head.')),
    data_preprocessor=dict(
        type=ActionDataPreprocessor,
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        format_shape='NCTHW'))

# dataset settings
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'

file_client_args = dict(io_backend='disk')
# Training: uniform 8-frame sampling + RandAugment + random resized crop.
train_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(type=UniformSample, clip_len=num_frames, num_clips=1),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 256)),
    dict(
        type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4),
    dict(type=RandomResizedCrop),
    dict(type=Resize, scale=(224, 224), keep_ratio=False),
    dict(type=Flip, flip_ratio=0.5),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]

# Validation: single clip, center crop.
val_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 224)),
    dict(type=CenterCrop, crop_size=224),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]

# Testing: 4 clips x 3 spatial crops (ThreeCrop) = 12 views per video.
test_pipeline = [
    dict(type=DecordInit, **file_client_args),
    dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True),
    dict(type=DecordDecode),
    dict(type=Resize, scale=(-1, 224)),
    dict(type=ThreeCrop, crop_size=224),
    dict(type=FormatShape, input_format='NCTHW'),
    dict(type=PackActionInputs)
]

train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=True),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=VideoDataset,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type=AccMetric)
test_evaluator = dict(type=AccMetric)
# Short 5-epoch fine-tuning schedule (starting from the K710 checkpoint).
train_cfg = dict(
    type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

base_lr = 2e-6  # small LR: this is a fine-tune, not training from scratch
optim_wrapper = dict(
    optimizer=dict(
        type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
    # Norm and bias parameters are excluded from weight decay.
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
    clip_grad=dict(max_norm=20, norm_type=2))

# 1 epoch of linear warmup, then cosine decay over the remaining 4 epochs.
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=0.5,
        by_epoch=True,
        begin=0,
        end=1,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        T_max=4,
        eta_min_ratio=0.5,
        by_epoch=True,
        begin=1,
        end=5,
        convert_to_iter_based=True)
]

# ``default_hooks`` comes from the inherited default_runtime base config.
default_hooks.update(
    dict(
        checkpoint=dict(interval=3, max_keep_ckpts=5),
        logger=dict(interval=100)))

# Default setting for scaling LR automatically
#   - `enable` means enable scaling LR automatically
#       or not by default.
#   - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=True, base_batch_size=256)
# Copyright (c) OpenMMLab. All rights reserved.
# PoseC3D (SlowOnly-R50) on NTU60 XSub using *limb* heatmaps: the
# GeneratePoseTarget transforms render pseudo-heatmap volumes from 2D
# skeletons (``with_limb=True``), which the 17-input-channel SlowOnly
# backbone consumes as video-like input.
from mmengine.config import read_base

with read_base():
    from ..._base_.default_runtime import *

from mmengine.dataset import DefaultSampler, RepeatDataset
from mmengine.optim import CosineAnnealingLR
from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
from torch.optim import SGD

from mmaction.datasets import (CenterCrop, Flip, FormatShape,
                               GeneratePoseTarget, PackActionInputs,
                               PoseCompact, PoseDataset, PoseDecode,
                               RandomResizedCrop, Resize, UniformSampleFrames)
from mmaction.evaluation import AccMetric
from mmaction.models import I3DHead, Recognizer3D, ResNet3dSlowOnly

model = dict(
    type=Recognizer3D,
    backbone=dict(
        type=ResNet3dSlowOnly,
        depth=50,
        pretrained=None,
        # 17 input channels: one heatmap channel per COCO keypoint/limb map.
        in_channels=17,
        base_channels=32,
        num_stages=3,
        out_indices=(2, ),
        stage_blocks=(4, 6, 3),
        conv1_stride_s=1,
        pool1_stride_s=1,
        inflate=(0, 1, 1),
        spatial_strides=(2, 2, 2),
        temporal_strides=(1, 1, 2),
        dilations=(1, 1, 1)),
    cls_head=dict(
        type=I3DHead,
        in_channels=512,
        num_classes=60,
        dropout_ratio=0.5,
        average_clips='prob'))

dataset_type = 'PoseDataset'
ann_file = 'data/skeleton/ntu60_2d.pkl'
# COCO-layout keypoint indices for horizontal-flip augmentation.
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
# Keypoint index pairs defining the limbs to be rendered as heatmaps.
skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11],
             [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2],
             [1, 3], [2, 4], [11, 12]]
# Indices into ``skeletons`` used for left/right swapping at test time.
left_limb = [0, 2, 3, 6, 7, 8, 12, 14]
right_limb = [1, 4, 5, 9, 10, 11, 13, 15]
train_pipeline = [
    dict(type=UniformSampleFrames, clip_len=48),
    dict(type=PoseDecode),
    dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
    dict(type=Resize, scale=(-1, 64)),
    dict(type=RandomResizedCrop, area_range=(0.56, 1.0)),
    dict(type=Resize, scale=(56, 56), keep_ratio=False),
    dict(type=Flip, flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp),
    dict(
        type=GeneratePoseTarget,
        sigma=0.6,
        use_score=True,
        with_kp=False,
        with_limb=True,
        skeletons=skeletons),
    dict(type=FormatShape, input_format='NCTHW_Heatmap'),
    dict(type=PackActionInputs)
]
val_pipeline = [
    dict(type=UniformSampleFrames, clip_len=48, num_clips=1, test_mode=True),
    dict(type=PoseDecode),
    dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
    dict(type=Resize, scale=(-1, 64)),
    dict(type=CenterCrop, crop_size=64),
    dict(
        type=GeneratePoseTarget,
        sigma=0.6,
        use_score=True,
        with_kp=False,
        with_limb=True,
        skeletons=skeletons),
    dict(type=FormatShape, input_format='NCTHW_Heatmap'),
    dict(type=PackActionInputs)
]
# Test-time: 10 clips and ``double=True`` (original + flipped heatmaps).
test_pipeline = [
    dict(type=UniformSampleFrames, clip_len=48, num_clips=10, test_mode=True),
    dict(type=PoseDecode),
    dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
    dict(type=Resize, scale=(-1, 64)),
    dict(type=CenterCrop, crop_size=64),
    dict(
        type=GeneratePoseTarget,
        sigma=0.6,
        use_score=True,
        with_kp=False,
        with_limb=True,
        skeletons=skeletons,
        double=True,
        left_limb=left_limb,
        right_limb=right_limb),
    dict(type=FormatShape, input_format='NCTHW_Heatmap'),
    dict(type=PackActionInputs)
]

train_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=True),
    dataset=dict(
        # Each "epoch" repeats the dataset 10x, matching the short
        # 24-epoch schedule below (effectively 240 passes).
        type=RepeatDataset,
        times=10,
        dataset=dict(
            type=PoseDataset,
            ann_file=ann_file,
            split='xsub_train',
            pipeline=train_pipeline)))
val_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=PoseDataset,
        ann_file=ann_file,
        split='xsub_val',
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=False),
    dataset=dict(
        type=PoseDataset,
        ann_file=ann_file,
        split='xsub_val',
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = [dict(type=AccMetric)]
test_evaluator = val_evaluator

train_cfg = dict(
    type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1)
val_cfg = dict(type=ValLoop)
test_cfg = dict(type=TestLoop)

param_scheduler = [
    dict(
        type=CosineAnnealingLR,
        eta_min=0,
        T_max=24,
        by_epoch=True,
        convert_to_iter_based=True)
]

optim_wrapper = dict(
    optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=0.0003),
    clip_grad=dict(max_norm=40, norm_type=2))
@DATASETS.register_module()
class ActivityNetDataset(BaseActionDataset):
    """ActivityNet dataset for temporal action localization.

    The dataset loads raw features and apply specified transforms to return
    a dict containing the frame tensors and other information.

    The ann_file is a json file with multiple objects, and each object has a
    key of the name of a video, and value of total frames of the video, total
    seconds of the video, annotations of a video, feature frames (frames
    covered by features) of the video, fps and rfps. Example of a annotation
    file:

    .. code-block:: JSON

        {
            "v_--1DO2V4K74":  {
                "duration_second": 211.53,
                "duration_frame": 6337,
                "annotations": [
                    {
                        "segment": [
                            30.025882995319815,
                            205.2318595943838
                        ],
                        "label": "Rock climbing"
                    }
                ],
                "feature_frame": 6336,
                "fps": 30.0,
                "rfps": 29.9579255898
            },
            ...
        }

    Args:
        ann_file (str): Path to the annotation file.
        pipeline (list[dict | callable]): A sequence of data transforms.
        data_prefix (dict or ConfigDict): Path to a directory where videos are
            held. Defaults to ``dict(video='')``.
        test_mode (bool): Store True when building test or validation dataset.
            Default: False.
    """

    def __init__(self,
                 ann_file: str,
                 pipeline: List[Union[dict, Callable]],
                 data_prefix: Optional[ConfigType] = dict(video=''),
                 test_mode: bool = False,
                 **kwargs):
        super().__init__(
            ann_file,
            pipeline=pipeline,
            data_prefix=data_prefix,
            test_mode=test_mode,
            **kwargs)

    def load_data_list(self) -> List[dict]:
        """Load annotation file to get video information.

        Returns:
            List[dict]: One record per video, carrying the original
            annotation fields plus ``feature_path`` and ``video_name``.
        """
        # FIX: ``mmengine.fileio.exists`` only *returns* a bool; the original
        # bare call was a no-op. Assert so a missing annotation file fails
        # fast with a clear message instead of an obscure load error.
        assert exists(self.ann_file), \
            f'Annotation file {self.ann_file} does not exist.'
        data_list = []
        anno_database = mmengine.load(self.ann_file)
        for video_name, video_info in anno_database.items():
            # Features are stored as one CSV per video under the video
            # prefix; keep '/' joining for file-client compatibility.
            video_info['feature_path'] = '%s/%s' % (
                self.data_prefix['video'], video_name + '.csv')
            video_info['video_name'] = video_name
            data_list.append(video_info)
        return data_list
@DATASETS.register_module()
class AudioDataset(BaseActionDataset):
    """Audio dataset for action recognition.

    The ann_file is a text file with multiple lines, and each line indicates
    a sample audio or extracted audio feature with the filepath, total frames
    of the raw video and label, which are split with a whitespace.
    Example of a annotation file:

    .. code-block:: txt

        some/directory-1.npy 163 1
        some/directory-2.npy 122 1
        some/directory-3.npy 258 2
        some/directory-4.npy 234 2
        some/directory-5.npy 295 3
        some/directory-6.npy 121 3

    Args:
        ann_file (str): Path to the annotation file.
        pipeline (list[dict | callable]): A sequence of data transforms.
        data_prefix (dict): Path to a directory where
            audios are held. Defaults to ``dict(audio='')``.
        multi_class (bool): Determines whether it is a multi-class
            recognition dataset. Defaults to False.
        num_classes (int, optional): Number of classes in the dataset.
            Defaults to None.
    """

    def __init__(self,
                 ann_file: str,
                 pipeline: List[Union[Dict, Callable]],
                 data_prefix: Dict = dict(audio=''),
                 multi_class: bool = False,
                 num_classes: Optional[int] = None,
                 **kwargs) -> None:
        super().__init__(
            ann_file,
            pipeline,
            data_prefix=data_prefix,
            multi_class=multi_class,
            num_classes=num_classes,
            modality='Audio',
            **kwargs)

    def load_data_list(self) -> List[Dict]:
        """Load annotation file to get audio information."""
        check_file_exist(self.ann_file)
        audio_prefix = self.data_prefix['audio']
        data_list = []
        with open(self.ann_file, 'r') as fin:
            for line in fin:
                # Each line: <path> <total_frames> <label> [<label> ...]
                fields = line.strip().split()
                audio_path = fields[0]
                if audio_prefix is not None:
                    audio_path = osp.join(audio_prefix, audio_path)
                total_frames = int(fields[1])
                label = [int(x) for x in fields[2:]]
                assert label, f'missing label in line: {line}'
                if not self.multi_class:
                    # Single-label samples carry exactly one class id.
                    assert len(label) == 1
                    label = label[0]
                else:
                    assert self.num_classes is not None
                data_list.append(
                    dict(
                        audio_path=audio_path,
                        total_frames=total_frames,
                        label=label))

        return data_list
@DATASETS.register_module()
class AVADataset(BaseActionDataset):
    """STAD dataset for spatial temporal action detection.

    The dataset loads raw frames/video files, bounding boxes, proposals and
    applies specified transformations to return a dict containing the frame
    tensors and other information.

    This datasets can load information from the following files:

    .. code-block:: txt

        ann_file -> ava_{train, val}_{v2.1, v2.2}.csv
        exclude_file -> ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv
        label_file -> ava_action_list_{v2.1, v2.2}.pbtxt /
                      ava_action_list_{v2.1, v2.2}_for_activitynet_2019.pbtxt
        proposal_file -> ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl

    Particularly, the proposal_file is a pickle file which contains
    ``img_key`` (in format of ``{video_id},{timestamp}``). Example of a pickle
    file:

    .. code-block:: JSON

        {
            ...
            '0f39OWEqJ24,0902':
                array([[0.011   , 0.157   , 0.655   , 0.983   , 0.998163]]),
            ...
        }

    Args:
        ann_file (str): Path to the annotation file like
            ``ava_{train, val}_{v2.1, v2.2}.csv``.
        exclude_file (str): Path to the excluded timestamp file like
            ``ava_{train, val}_excluded_timestamps_{v2.1, v2.2}.csv``.
        pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of
            data transforms.
        label_file (str): Path to the label file like
            ``ava_action_list_{v2.1, v2.2}.pbtxt``. Defaults to None.
        filename_tmpl (str): Template for each filename.
            Defaults to 'img_{:05}.jpg'.
        start_index (int): Specify a start index for frames in consideration
            of different filename format. It should be set to 1 for AVA, since
            frame index start from 1 in AVA dataset. Defaults to 1.
        proposal_file (str): Path to the proposal file like
            ``ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl``.
            Defaults to None.
        person_det_score_thr (float): The threshold of person detection
            scores; bboxes with scores above the threshold will be used.
            Note that 0 <= person_det_score_thr <= 1. If no proposal has
            detection score larger than the threshold, the one with the
            largest detection score will be used. Default: 0.9.
        num_classes (int): The number of classes of the dataset. Default: 81.
            (AVA has 80 action classes, another 1-dim is added for potential
            usage)
        custom_classes (List[int], optional): A subset of class ids from
            origin dataset. Please note that 0 should NOT be selected, and
            ``num_classes`` should be equal to ``len(custom_classes) + 1``.
        data_prefix (dict or ConfigDict): Path to a directory where video
            frames are held. Defaults to ``dict(img='')``.
        test_mode (bool): Store True when building test or validation dataset.
            Defaults to False.
        modality (str): Modality of data. Support ``RGB``, ``Flow``.
            Defaults to ``RGB``.
        num_max_proposals (int): Max proposals number to store.
            Defaults to 1000.
        timestamp_start (int): The start point of included timestamps.
            Defaults to 902 on the official website.
        timestamp_end (int): The end point of included timestamps.
            Defaults to 1798 on the official website.
        use_frames (bool): Whether to use rawframes as input.
            Defaults to True.
        fps (int): Overrides the default FPS for the dataset. If set to 1,
            means counting timestamp by frame, e.g. MultiSports dataset.
            Otherwise by second. Defaults to 30.
        multilabel (bool): Determines whether it is a multilabel recognition
            task. Defaults to True.
    """

    def __init__(self,
                 ann_file: str,
                 pipeline: List[Union[ConfigType, Callable]],
                 exclude_file: Optional[str] = None,
                 label_file: Optional[str] = None,
                 filename_tmpl: str = 'img_{:05}.jpg',
                 start_index: int = 1,
                 proposal_file: str = None,
                 person_det_score_thr: float = 0.9,
                 num_classes: int = 81,
                 custom_classes: Optional[List[int]] = None,
                 data_prefix: ConfigType = dict(img=''),
                 modality: str = 'RGB',
                 test_mode: bool = False,
                 num_max_proposals: int = 1000,
                 timestamp_start: int = 900,
                 timestamp_end: int = 1800,
                 use_frames: bool = True,
                 fps: int = 30,
                 multilabel: bool = True,
                 **kwargs) -> None:
        self._FPS = fps  # Keep this as standard
        self.custom_classes = custom_classes
        if custom_classes is not None:
            assert num_classes == len(custom_classes) + 1
            assert 0 not in custom_classes
            # FIX: close the label file instead of leaking the handle.
            with open(label_file) as f:
                _, class_whitelist = read_labelmap(f)
            assert set(custom_classes).issubset(class_whitelist)

            # Class 0 is reserved as background / "no action".
            self.custom_classes = [0] + list(custom_classes)
        self.exclude_file = exclude_file
        self.label_file = label_file
        self.proposal_file = proposal_file
        assert 0 <= person_det_score_thr <= 1, (
            'The value of '
            'person_det_score_thr should in [0, 1]. ')
        self.person_det_score_thr = person_det_score_thr
        self.timestamp_start = timestamp_start
        self.timestamp_end = timestamp_end
        self.num_max_proposals = num_max_proposals
        self.filename_tmpl = filename_tmpl
        self.use_frames = use_frames
        self.multilabel = multilabel

        super().__init__(
            ann_file,
            pipeline=pipeline,
            data_prefix=data_prefix,
            test_mode=test_mode,
            num_classes=num_classes,
            start_index=start_index,
            modality=modality,
            **kwargs)

        if self.proposal_file is not None:
            self.proposals = load(self.proposal_file)
        else:
            self.proposals = None

    def parse_img_record(self, img_records: List[dict]) -> tuple:
        """Merge image records of the same entity at the same time.

        Args:
            img_records (List[dict]): List of img_records (lines in AVA
                annotations).

        Returns:
            Tuple(list): A tuple consists of lists of bboxes, action labels
                and entity_ids.
        """
        bboxes, labels, entity_ids = [], [], []
        # Repeatedly peel off all records sharing the first record's box;
        # each group of identical boxes becomes one entity with (possibly)
        # multiple action labels.
        while len(img_records) > 0:
            img_record = img_records[0]
            num_img_records = len(img_records)

            selected_records = [
                x for x in img_records
                if np.array_equal(x['entity_box'], img_record['entity_box'])
            ]

            num_selected_records = len(selected_records)
            img_records = [
                x for x in img_records if
                not np.array_equal(x['entity_box'], img_record['entity_box'])
            ]

            assert len(img_records) + num_selected_records == num_img_records

            bboxes.append(img_record['entity_box'])
            valid_labels = np.array([
                selected_record['label']
                for selected_record in selected_records
            ])

            # The format can be directly used by BCELossWithLogits
            if self.multilabel:
                label = np.zeros(self.num_classes, dtype=np.float32)
                label[valid_labels] = 1.
            else:
                label = valid_labels

            labels.append(label)
            entity_ids.append(img_record['entity_id'])
        bboxes = np.stack(bboxes)
        labels = np.stack(labels)
        entity_ids = np.stack(entity_ids)
        return bboxes, labels, entity_ids

    def load_data_list(self) -> List[dict]:
        """Load AVA annotations."""
        # FIX: ``exists`` only returns a bool; assert on it so a missing
        # annotation file fails fast instead of being silently ignored.
        assert exists(self.ann_file), \
            f'Annotation file {self.ann_file} does not exist.'
        # FIX: shot_info depends only on dataset-level settings, never on an
        # individual annotation line — compute it once here. (The original
        # code read a leftover loop variable in the second loop below.)
        if self.use_frames:
            shot_info = (0, (self.timestamp_end - self.timestamp_start) *
                         self._FPS)
        else:
            # for video data, automatically get shot info when decoding
            shot_info = None

        data_list = []
        records_dict_by_img = defaultdict(list)
        fin = list_from_file(self.ann_file)
        for line in fin:
            line_split = line.strip().split(',')

            label = int(line_split[6])
            if self.custom_classes is not None:
                if label not in self.custom_classes:
                    continue
                # Remap original class id to its index in the custom subset.
                label = self.custom_classes.index(label)

            video_id = line_split[0]
            timestamp = int(line_split[1])  # count by second or frame.
            img_key = f'{video_id},{timestamp:04d}'

            entity_box = np.array(list(map(float, line_split[2:6])))
            entity_id = int(line_split[7])

            video_info = dict(
                video_id=video_id,
                timestamp=timestamp,
                entity_box=entity_box,
                label=label,
                entity_id=entity_id,
                shot_info=shot_info)
            records_dict_by_img[img_key].append(video_info)

        for img_key in records_dict_by_img:
            video_id, timestamp = img_key.split(',')
            bboxes, labels, entity_ids = self.parse_img_record(
                records_dict_by_img[img_key])
            ann = dict(
                gt_bboxes=bboxes, gt_labels=labels, entity_ids=entity_ids)
            frame_dir = video_id
            if self.data_prefix['img'] is not None:
                frame_dir = osp.join(self.data_prefix['img'], frame_dir)
            video_info = dict(
                frame_dir=frame_dir,
                video_id=video_id,
                timestamp=int(timestamp),
                img_key=img_key,
                shot_info=shot_info,
                fps=self._FPS,
                ann=ann)
            if not self.use_frames:
                # Video input expects ``filename`` rather than ``frame_dir``.
                video_info['filename'] = video_info.pop('frame_dir')
            data_list.append(video_info)

        return data_list

    def filter_data(self) -> List[dict]:
        """Filter out records in the exclude_file."""
        valid_indexes = []
        if self.exclude_file is None:
            valid_indexes = list(range(len(self.data_list)))
        else:
            # FIX: close the exclude file after reading (was leaked).
            with open(self.exclude_file) as f:
                exclude_video_infos = [x.strip().split(',') for x in f]
            for i, data_info in enumerate(self.data_list):
                valid_indexes.append(i)
                for video_id, timestamp in exclude_video_infos:
                    if (data_info['video_id'] == video_id
                            and data_info['timestamp'] == int(timestamp)):
                        valid_indexes.pop()
                        break

        logger = MMLogger.get_current_instance()
        logger.info(f'{len(valid_indexes)} out of {len(self.data_list)}'
                    f' frames are valid.')
        data_list = [self.data_list[i] for i in valid_indexes]

        return data_list

    def get_data_info(self, idx: int) -> dict:
        """Get annotation by index."""
        data_info = super().get_data_info(idx)
        img_key = data_info['img_key']

        data_info['filename_tmpl'] = self.filename_tmpl
        data_info['timestamp_start'] = self.timestamp_start
        data_info['timestamp_end'] = self.timestamp_end

        if self.proposals is not None:
            if img_key not in self.proposals:
                # No proposal for this frame: fall back to a full-image box.
                data_info['proposals'] = np.array([[0, 0, 1, 1]])
                data_info['scores'] = np.array([1])
            else:
                proposals = self.proposals[img_key]
                assert proposals.shape[-1] in [4, 5]
                if proposals.shape[-1] == 5:
                    # Keep proposals above the threshold; if none pass, the
                    # ``min`` guarantees the single best proposal survives.
                    thr = min(self.person_det_score_thr, max(proposals[:, 4]))
                    positive_inds = (proposals[:, 4] >= thr)
                    proposals = proposals[positive_inds]
                    proposals = proposals[:self.num_max_proposals]
                    data_info['proposals'] = proposals[:, :4]
                    data_info['scores'] = proposals[:, 4]
                else:
                    proposals = proposals[:self.num_max_proposals]
                    data_info['proposals'] = proposals

            # Proposals must be relative coordinates in [0, 1].
            assert data_info['proposals'].max() <= 1 and \
                data_info['proposals'].min() >= 0, \
                (f'relative proposals invalid: max value '
                 f'{data_info["proposals"].max()}, min value '
                 f'{data_info["proposals"].min()}')

        ann = data_info.pop('ann')
        data_info['gt_bboxes'] = ann['gt_bboxes']
        data_info['gt_labels'] = ann['gt_labels']
        data_info['entity_ids'] = ann['entity_ids']

        return data_info
+ proposal_file (str): Path to the proposal file like + ``ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl``. + Defaults to None. + person_det_score_thr (float): The threshold of person detection scores, + bboxes with scores above the threshold will be used. + Note that 0 <= person_det_score_thr <= 1. If no proposal has + detection score larger than the threshold, the one with the largest + detection score will be used. Default: 0.9. + num_classes (int): The number of classes of the dataset. Default: 81. + (AVA has 80 action classes, another 1-dim is added for potential + usage) + custom_classes (List[int], optional): A subset of class ids from origin + dataset. Please note that 0 should NOT be selected, and + ``num_classes`` should be equal to ``len(custom_classes) + 1``. + data_prefix (dict or ConfigDict): Path to a directory where video + frames are held. Defaults to ``dict(img='')``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + num_max_proposals (int): Max proposals number to store. + Defaults to 1000. + timestamp_start (int): The start point of included timestamps. The + default value is referred from the official website. + Defaults to 902. + timestamp_end (int): The end point of included timestamps. The default + value is referred from the official website. Defaults to 1798. + fps (int): Overrides the default FPS for the dataset. Defaults to 30. 
+ """ + + def __init__(self, + ann_file: str, + exclude_file: str, + pipeline: List[Union[ConfigType, Callable]], + label_file: str, + filename_tmpl: str = 'img_{:05}.jpg', + start_index: int = 0, + proposal_file: str = None, + person_det_score_thr: float = 0.9, + num_classes: int = 81, + custom_classes: Optional[List[int]] = None, + data_prefix: ConfigType = dict(img=''), + modality: str = 'RGB', + test_mode: bool = False, + num_max_proposals: int = 1000, + timestamp_start: int = 900, + timestamp_end: int = 1800, + fps: int = 30, + **kwargs) -> None: + self._FPS = fps # Keep this as standard + self.custom_classes = custom_classes + if custom_classes is not None: + assert num_classes == len(custom_classes) + 1 + assert 0 not in custom_classes + _, class_whitelist = read_labelmap(open(label_file)) + assert set(custom_classes).issubset(class_whitelist) + + self.custom_classes = list([0] + custom_classes) + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + self.num_max_proposals = num_max_proposals + self.filename_tmpl = filename_tmpl + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + num_classes=num_classes, + start_index=start_index, + modality=modality, + **kwargs) + + if self.proposal_file is not None: + self.proposals = load(self.proposal_file) + else: + self.proposals = None + + def parse_img_record(self, img_records: List[dict]) -> tuple: + """Merge image records of the same entity at the same time. + + Args: + img_records (List[dict]): List of img_records (lines in AVA + annotations). + + Returns: + Tuple(list): A tuple consists of lists of bboxes, action labels and + entity_ids. 
+ """ + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + + selected_records = [ + x for x in img_records + if np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + num_selected_records = len(selected_records) + img_records = [ + x for x in img_records if + not np.array_equal(x['entity_box'], img_record['entity_box']) + ] + + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] + for selected_record in selected_records + ]) + + # The format can be directly used by BCELossWithLogits + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. + + labels.append(label) + entity_ids.append(img_record['entity_id']) + + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def filter_data(self) -> List[dict]: + """Filter out records in the exclude_file.""" + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.data_list))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, data_info in enumerate(self.data_list): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (data_info['video_id'] == video_id + and data_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + + logger = MMLogger.get_current_instance() + logger.info(f'{len(valid_indexes)} out of {len(self.data_list)}' + f' frames are valid.') + data_list = [self.data_list[i] for i in valid_indexes] + + return data_list + + def get_timestamp(self, video_id): + if len(video_id) == 11: + return self.timestamp_start, self.timestamp_end + video_id = video_id.split('_') + if len(video_id) >= 3: + start = int(video_id[-2]) + end = int(video_id[-1]) + video_id = '_'.join(video_id[:-2]) + 
return start, end + return self.timestamp_start, self.timestamp_end + + def load_data_list(self) -> List[dict]: + """Load AVA annotations.""" + exists(self.ann_file) + data_list = [] + records_dict_by_img = defaultdict(list) + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split(',') + + label = int(line_split[6]) + if self.custom_classes is not None: + if label not in self.custom_classes: + continue + label = self.custom_classes.index(label) + + video_id = line_split[0] + timestamp = int(line_split[1]) + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + entity_id = int(line_split[7]) + start, end = self.get_timestamp(video_id) + shot_info = (1, (end - start) * self._FPS + 1) + + video_info = dict( + video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + start, end = self.get_timestamp(video_id) + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict( + gt_bboxes=bboxes, gt_labels=labels, entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix['img'] is not None: + frame_dir = osp.join(self.data_prefix['img'], frame_dir) + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + timestamp_start=start, + timestamp_end=end, + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + + return data_list + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + img_key = data_info['img_key'] + data_info['filename_tmpl'] = self.filename_tmpl + if 'timestamp_start' not in data_info: + data_info['timestamp_start'] = self.timestamp_start + data_info['timestamp_end'] = self.timestamp_end 
+ + if self.proposals is not None: + if len(img_key) == 16: + proposal_key = img_key + else: + video_id, timestamp = img_key.split(',') + vid = '_'.join(video_id.split('_')[:-2]) + timestamp = int(timestamp) + proposal_key = f'{vid},{timestamp:04d}' + + if proposal_key not in self.proposals: + data_info['proposals'] = np.array([[0, 0, 1, 1]]) + data_info['scores'] = np.array([1]) + else: + proposals = self.proposals[proposal_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + data_info['proposals'] = proposals[:, :4] + data_info['scores'] = proposals[:, 4] + else: + proposals = proposals[:self.num_max_proposals] + data_info['proposals'] = proposals + + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + + return data_info diff --git a/mmaction/datasets/base.py b/mmaction/datasets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ed485142aee2c2172ed9cf917f9f672cfcb8c19a --- /dev/null +++ b/mmaction/datasets/base.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta +from typing import Callable, List, Optional, Union + +import torch +from mmengine.dataset import BaseDataset + +from mmaction.utils import ConfigType + + +class BaseActionDataset(BaseDataset, metaclass=ABCMeta): + """Base class for datasets. + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict, optional): Path to a directory where + videos are held. Defaults to None. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. 
        multi_class (bool): Determines whether the dataset is a multi-class
            dataset. Defaults to False.
        num_classes (int, optional): Number of classes of the dataset, used in
            multi-class datasets. Defaults to None.
        start_index (int): Specify a start index for frames in consideration of
            different filename format. However, when taking videos as input,
            it should be set to 0, since frames loaded from videos count
            from 0. Defaults to 0.
        modality (str): Modality of data. Support ``RGB``, ``Flow``, ``Pose``,
            ``Audio``. Defaults to ``RGB``.
    """

    def __init__(self,
                 ann_file: str,
                 pipeline: List[Union[ConfigType, Callable]],
                 # NOTE(review): a mutable dict as a default value is shared
                 # across instances; it is only forwarded to the parent class
                 # here, but a None sentinel would be safer — TODO confirm
                 # against mmengine.BaseDataset's handling.
                 data_prefix: Optional[ConfigType] = dict(prefix=''),
                 test_mode: bool = False,
                 multi_class: bool = False,
                 num_classes: Optional[int] = None,
                 start_index: int = 0,
                 modality: str = 'RGB',
                 **kwargs) -> None:
        # Stash attributes consumed later by ``get_data_info``; they must be
        # set before the parent __init__, which may already run the pipeline.
        self.multi_class = multi_class
        self.num_classes = num_classes
        self.start_index = start_index
        self.modality = modality
        super().__init__(
            ann_file,
            pipeline=pipeline,
            data_prefix=data_prefix,
            test_mode=test_mode,
            **kwargs)

    def get_data_info(self, idx: int) -> dict:
        """Get annotation by index.

        Injects ``modality`` and ``start_index`` into the sample dict and,
        for multi-class datasets, converts the label into a multi-hot
        vector of length ``num_classes``.
        """
        data_info = super().get_data_info(idx)
        data_info['modality'] = self.modality
        data_info['start_index'] = self.start_index

        if self.multi_class:
            # ``label`` may hold several class indices; indexed assignment
            # sets every listed position to 1 in the multi-hot vector.
            onehot = torch.zeros(self.num_classes)
            onehot[data_info['label']] = 1.
            data_info['label'] = onehot

        return data_info
diff --git a/mmaction/datasets/charades_sta_dataset.py b/mmaction/datasets/charades_sta_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e27f986fde31f6b09ee75db61fc0972af54b2d84
--- /dev/null
+++ b/mmaction/datasets/charades_sta_dataset.py
@@ -0,0 +1,124 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os +from typing import Callable, List, Optional, Union + +import mmengine +import numpy as np +import torch +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + +try: + import nltk + nltk_imported = True +except ImportError: + nltk_imported = False + + +@DATASETS.register_module() +class CharadesSTADataset(BaseActionDataset): + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + word2id_file: str, + fps_file: str, + duration_file: str, + num_frames_file: str, + window_size: int, + ft_overlap: float, + data_prefix: Optional[ConfigType] = dict(video=''), + test_mode: bool = False, + **kwargs): + if not nltk_imported: + raise ImportError('nltk is required for CharadesSTADataset') + + self.fps_info = mmengine.load(fps_file) + self.duration_info = mmengine.load(duration_file) + self.num_frames = mmengine.load(num_frames_file) + self.word2id = mmengine.load(word2id_file) + self.ft_interval = int(window_size * (1 - ft_overlap)) + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + with open(self.ann_file) as f: + anno_database = f.readlines() + + for item in anno_database: + first_part, query_sentence = item.strip().split('##') + query_sentence = query_sentence.replace('.', '') + query_words = nltk.word_tokenize(query_sentence) + query_tokens = [self.word2id[word] for word in query_words] + query_length = len(query_tokens) + query_tokens = torch.from_numpy(np.array(query_tokens)) + + vid_name, start_time, end_time = first_part.split() + duration = float(self.duration_info[vid_name]) + fps = float(self.fps_info[vid_name]) + + gt_start_time = float(start_time) + gt_end_time = float(end_time) + + gt_bbox = (gt_start_time / duration, 
min(gt_end_time / duration, + 1)) + + num_frames = int(self.num_frames[vid_name]) + proposal_frames = self.get_proposals(num_frames) + + proposals = proposal_frames / num_frames + proposals = torch.from_numpy(proposals) + proposal_indexes = proposal_frames / self.ft_interval + proposal_indexes = proposal_indexes.astype(np.int32) + + info = dict( + vid_name=vid_name, + fps=fps, + num_frames=num_frames, + duration=duration, + query_tokens=query_tokens, + query_length=query_length, + gt_start_time=gt_start_time, + gt_end_time=gt_end_time, + gt_bbox=gt_bbox, + proposals=proposals, + num_proposals=proposals.shape[0], + proposal_indexes=proposal_indexes) + data_list.append(info) + return data_list + + def get_proposals(self, num_frames): + proposals = (num_frames - 1) / 32 * np.arange(33) + proposals = proposals.astype(np.int32) + proposals = np.stack([proposals[:-1], proposals[1:]]).T + return proposals + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + vid_name = data_info['vid_name'] + feature_path = os.path.join(self.data_prefix['video'], + f'{vid_name}.pt') + vid_feature = torch.load(feature_path) + proposal_feats = [] + proposal_indexes = data_info['proposal_indexes'].clip( + max=vid_feature.shape[0] - 1) + for s, e in proposal_indexes: + prop_feature, _ = vid_feature[s:e + 1].max(dim=0) + proposal_feats.append(prop_feature) + + proposal_feats = torch.stack(proposal_feats) + + data_info['raw_feature'] = proposal_feats + return data_info diff --git a/mmaction/datasets/msrvtt_datasets.py b/mmaction/datasets/msrvtt_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..da249de2a4ceb011666a3edd36c612c96975df00 --- /dev/null +++ b/mmaction/datasets/msrvtt_datasets.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +import re +from collections import Counter +from typing import Dict, List + +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class MSRVTTVQA(BaseActionDataset): + """MSR-VTT Video Question Answering dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + for data in data_lines: + answers = data['answer'] + if isinstance(answers, str): + answers = [answers] + count = Counter(answers) + answer_weight = [i / len(answers) for i in count.values()] + data_item = dict( + question_id=data['question_id'], + filename=osp.join(self.data_prefix['video'], + data['video']), + question=pre_text(data['question']), + gt_answer=list(count.keys()), + gt_answer_weight=answer_weight) + data_list.append(data_item) + + return data_list + + +@DATASETS.register_module() +class MSRVTTVQAMC(BaseActionDataset): + """MSR-VTT VQA multiple choices dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + for data in data_lines: + data_item = dict( + filename=osp.join(self.data_prefix['video'], + data['video']), + label=data['answer'], + caption_options=[pre_text(c) for c in data['caption']]) + data_list.append(data_item) + + return data_list + + +@DATASETS.register_module() +class MSRVTTRetrieval(BaseActionDataset): + """MSR-VTT Retrieval dataset.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + data_lines = json.load(f) + video_idx = 0 + text_idx = 0 + for data in data_lines: + # don't consider multiple videos or 
multiple captions + video_path = osp.join(self.data_prefix['video'], data['video']) + data_item = dict( + filename=video_path, + text=[], + gt_video_id=[], + gt_text_id=[]) + if isinstance(data['caption'], str): + data['caption'] = [data['caption']] + + for text in data['caption']: + text = pre_text(text) + data_item['text'].append(text) + data_item['gt_video_id'].append(video_idx) + data_item['gt_text_id'].append(text_idx) + text_idx += 1 + + video_idx += 1 + data_list.append(data_item) + self.num_videos = video_idx + self.num_texts = text_idx + + return data_list + + +def pre_text(text, max_l=None): + text = re.sub(r"([,.'!?\"()*#:;~])", '', text.lower()) + text = text.replace('-', ' ').replace('/', + ' ').replace('', 'person') + + text = re.sub(r'\s{2,}', ' ', text) + text = text.rstrip('\n').strip(' ') + + if max_l: # truncate + words = text.split(' ') + if len(words) > max_l: + text = ' '.join(words[:max_l]) + return text diff --git a/mmaction/datasets/pose_dataset.py b/mmaction/datasets/pose_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4862ad12d4775c82693b8c13179c1795e334bbe1 --- /dev/null +++ b/mmaction/datasets/pose_dataset.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, Dict, List, Optional, Union + +import mmengine +from mmengine.logging import MMLogger + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class PoseDataset(BaseActionDataset): + """Pose dataset for action recognition. + + The dataset loads pose and apply specified transforms to return a + dict containing pose information. + + The ann_file is a pickle file, the json file contains a list of + annotations, the fields of an annotation include frame_dir(video_id), + total_frames, label, kp, kpscore. + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. 
+ split (str, optional): The dataset split used. For UCF101 and + HMDB51, allowed choices are 'train1', 'test1', 'train2', + 'test2', 'train3', 'test3'. For NTURGB+D, allowed choices + are 'xsub_train', 'xsub_val', 'xview_train', 'xview_val'. + For NTURGB+D 120, allowed choices are 'xsub_train', + 'xsub_val', 'xset_train', 'xset_val'. For FineGYM, + allowed choices are 'train', 'val'. Defaults to None. + valid_ratio (float, optional): The valid_ratio for videos in + KineticsPose. For a video with n frames, it is a valid + training sample only if n * valid_ratio frames have human + pose. None means not applicable (only applicable to Kinetics + Pose).Defaults to None. + box_thr (float): The threshold for human proposals. Only boxes + with confidence score larger than `box_thr` is kept. None + means not applicable (only applicable to Kinetics). Allowed + choices are 0.5, 0.6, 0.7, 0.8, 0.9. Defaults to 0.5. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[Dict, Callable]], + split: Optional[str] = None, + valid_ratio: Optional[float] = None, + box_thr: float = 0.5, + **kwargs) -> None: + self.split = split + self.box_thr = box_thr + assert box_thr in [.5, .6, .7, .8, .9] + self.valid_ratio = valid_ratio + + super().__init__( + ann_file, pipeline=pipeline, modality='Pose', **kwargs) + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get skeleton information.""" + assert self.ann_file.endswith('.pkl') + mmengine.exists(self.ann_file) + data_list = mmengine.load(self.ann_file) + + if self.split is not None: + split, annos = data_list['split'], data_list['annotations'] + identifier = 'filename' if 'filename' in annos[0] else 'frame_dir' + split = set(split[self.split]) + data_list = [x for x in annos if x[identifier] in split] + + # Sometimes we may need to load video from the file + if 'video' in self.data_prefix: + for item in data_list: + if 'filename' in item: + item['filename'] = osp.join(self.data_prefix['video'], + 
item['filename']) + if 'frame_dir' in item: + item['frame_dir'] = osp.join(self.data_prefix['video'], + item['frame_dir']) + return data_list + + def filter_data(self) -> List[Dict]: + """Filter out invalid samples.""" + if self.valid_ratio is not None and isinstance( + self.valid_ratio, float) and self.valid_ratio > 0: + self.data_list = [ + x for x in self.data_list if x['valid'][self.box_thr] / + x['total_frames'] >= self.valid_ratio + ] + for item in self.data_list: + assert 'box_score' in item,\ + 'if valid_ratio is a positive number,' \ + 'item should have field `box_score`' + anno_inds = (item['box_score'] >= self.box_thr) + item['anno_inds'] = anno_inds + + logger = MMLogger.get_current_instance() + logger.info( + f'{len(self.data_list)} videos remain after valid thresholding') + + return self.data_list + + def get_data_info(self, idx: int) -> Dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + + # Sometimes we may need to load skeleton from the file + if 'skeleton' in self.data_prefix: + identifier = 'filename' if 'filename' in data_info \ + else 'frame_dir' + ske_name = data_info[identifier] + ske_path = osp.join(self.data_prefix['skeleton'], + ske_name + '.pkl') + ske = mmengine.load(ske_path) + for k in ske: + data_info[k] = ske[k] + + return data_info diff --git a/mmaction/datasets/rawframe_dataset.py b/mmaction/datasets/rawframe_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c333446dfad882041e70149eb9efa5d936837104 --- /dev/null +++ b/mmaction/datasets/rawframe_dataset.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +from typing import Callable, List, Optional, Union + +from mmengine.fileio import exists, list_from_file + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class RawframeDataset(BaseActionDataset): + """Rawframe dataset for action recognition. + + The dataset loads raw frames and apply specified transforms to return a + dict containing the frame tensors and other information. + + The ann_file is a text file with multiple lines, and each line indicates + the directory to frames of a video, total frames of the video and + the label of a video, which are split with a whitespace. + Example of a annotation file: + + .. code-block:: txt + + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + + Example of a multi-class annotation file: + + + .. code-block:: txt + + some/directory-1 163 1 3 5 + some/directory-2 122 1 2 + some/directory-3 258 2 + some/directory-4 234 2 4 6 8 + some/directory-5 295 3 + some/directory-6 121 3 + + Example of a with_offset annotation file (clips from long videos), each + line indicates the directory to frames of a video, the index of the start + frame, total frames of the video clip and the label of a video clip, which + are split with a whitespace. + + + .. code-block:: txt + + some/directory-1 12 163 3 + some/directory-2 213 122 4 + some/directory-3 100 258 5 + some/directory-4 98 234 2 + some/directory-5 0 295 3 + some/directory-6 50 121 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict): Path to a directory where video + frames are held. Defaults to ``dict(img='')``. + filename_tmpl (str): Template for each filename. + Defaults to ``img_{:05}.jpg``. 
+ with_offset (bool): Determines whether the offset information is in + ann_file. Defaults to False. + multi_class (bool): Determines whether it is a multi-class + recognition dataset. Defaults to False. + num_classes (int, optional): Number of classes in the dataset. + Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking frames as input, + it should be set to 1, since raw frames count from 1. + Defaults to 1. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[ConfigType, Callable]], + data_prefix: ConfigType = dict(img=''), + filename_tmpl: str = 'img_{:05}.jpg', + with_offset: bool = False, + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 1, + modality: str = 'RGB', + test_mode: bool = False, + **kwargs) -> None: + self.filename_tmpl = filename_tmpl + self.with_offset = with_offset + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + test_mode=test_mode, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + video_info = {} + idx = 0 + # idx for frame_dir + frame_dir = line_split[idx] + if self.data_prefix['img'] is not None: + frame_dir = osp.join(self.data_prefix['img'], frame_dir) + video_info['frame_dir'] = frame_dir + idx += 1 + if self.with_offset: + # idx for offset and total_frames + video_info['offset'] = int(line_split[idx]) + video_info['total_frames'] = int(line_split[idx + 1]) + idx += 2 + else: + # idx for 
total_frames + video_info['total_frames'] = int(line_split[idx]) + idx += 1 + # idx for label[s] + label = [int(x) for x in line_split[idx:]] + # add fake label for inference datalist without label + if not label: + label = [-1] + if self.multi_class: + assert self.num_classes is not None + video_info['label'] = label + else: + assert len(label) == 1 + video_info['label'] = label[0] + data_list.append(video_info) + + return data_list + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = super().get_data_info(idx) + data_info['filename_tmpl'] = self.filename_tmpl + return data_info diff --git a/mmaction/datasets/repeat_aug_dataset.py b/mmaction/datasets/repeat_aug_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..cd1e3a4c0876171a4b859356fabd7358e04c9973 --- /dev/null +++ b/mmaction/datasets/repeat_aug_dataset.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from typing import Any, Callable, List, Optional, Sequence, Union + +import numpy as np +from mmengine.dataset import COLLATE_FUNCTIONS, pseudo_collate + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .video_dataset import VideoDataset + + +def get_type(transform: Union[dict, Callable]) -> str: + """get the type of the transform.""" + if isinstance(transform, dict) and 'type' in transform: + return transform['type'] + elif callable(transform): + return transform.__repr__().split('(')[0] + else: + raise TypeError + + +@DATASETS.register_module() +class RepeatAugDataset(VideoDataset): + """Video dataset for action recognition use repeat augment. + https://arxiv.org/pdf/1901.09335.pdf. + + The dataset loads raw videos and apply specified transforms to return a + dict containing the frame tensors and other information. 
+ + The ann_file is a text file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a + whitespace. Example of a annotation file: + + .. code-block:: txt + + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of + data transforms. + data_prefix (dict or ConfigDict): Path to a directory where videos + are held. Defaults to ``dict(video='')``. + num_repeats (int): Number of repeat time of one video in a batch. + Defaults to 4. + sample_once (bool): Determines whether use same frame index for + repeat samples. Defaults to False. + multi_class (bool): Determines whether the dataset is a multi-class + dataset. Defaults to False. + num_classes (int, optional): Number of classes of the dataset, used in + multi-class datasets. Defaults to None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Defaults to 0. + modality (str): Modality of data. Support ``RGB``, ``Flow``. + Defaults to ``RGB``. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. 
+ """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + data_prefix: ConfigType = dict(video=''), + num_repeats: int = 4, + sample_once: bool = False, + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 0, + modality: str = 'RGB', + **kwargs) -> None: + + use_decord = get_type(pipeline[0]) == 'DecordInit' and \ + get_type(pipeline[2]) == 'DecordDecode' + + assert use_decord, ( + 'RepeatAugDataset requires decord as the video ' + 'loading backend, will support more backends in the ' + 'future') + + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + test_mode=False, + **kwargs) + self.num_repeats = num_repeats + self.sample_once = sample_once + + def prepare_data(self, idx) -> List[dict]: + """Get data processed by ``self.pipeline``. + + Reduce the video loading and decompressing. + Args: + idx (int): The index of ``data_info``. + Returns: + List[dict]: A list of length num_repeats. 
+ """ + transforms = self.pipeline.transforms + + data_info = self.get_data_info(idx) + data_info = transforms[0](data_info) # DecordInit + + frame_inds_list, frame_inds_length = [], [0] + + fake_data_info = dict( + total_frames=data_info['total_frames'], + start_index=data_info['start_index']) + + if not self.sample_once: + for repeat in range(self.num_repeats): + data_info_ = transforms[1](fake_data_info) # SampleFrames + frame_inds = data_info_['frame_inds'] + frame_inds_list.append(frame_inds.reshape(-1)) + frame_inds_length.append(frame_inds.size + + frame_inds_length[-1]) + else: + data_info_ = transforms[1](fake_data_info) # SampleFrames + frame_inds = data_info_['frame_inds'] + for repeat in range(self.num_repeats): + frame_inds_list.append(frame_inds.reshape(-1)) + frame_inds_length.append(frame_inds.size + + frame_inds_length[-1]) + + for key in data_info_: + data_info[key] = data_info_[key] + + data_info['frame_inds'] = np.concatenate(frame_inds_list) + + data_info = transforms[2](data_info) # DecordDecode + imgs = data_info.pop('imgs') + + data_info_list = [] + for repeat in range(self.num_repeats): + data_info_ = deepcopy(data_info) + start = frame_inds_length[repeat] + end = frame_inds_length[repeat + 1] + data_info_['imgs'] = imgs[start:end] + for transform in transforms[3:]: + data_info_ = transform(data_info_) + data_info_list.append(data_info_) + del imgs + return data_info_list + + +@COLLATE_FUNCTIONS.register_module() +def repeat_pseudo_collate(data_batch: Sequence) -> Any: + data_batch = [i for j in data_batch for i in j] + return pseudo_collate(data_batch) diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3205ca7425d5ff6d41c7776a3b6f3308e353580b --- /dev/null +++ b/mmaction/datasets/transforms/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .formatting import (FormatAudioShape, FormatGCNInput, FormatShape, + PackActionInputs, PackLocalizationInputs, Transpose) +from .loading import (ArrayDecode, AudioFeatureSelector, BuildPseudoClip, + DecordDecode, DecordInit, DenseSampleFrames, + GenerateLocalizationLabels, ImageDecode, + LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature, + LoadProposals, LoadRGBFromFile, OpenCVDecode, OpenCVInit, + PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, + PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, + UniformSample, UntrimmedSampleFrames) +from .pose_transforms import (DecompressPose, GeneratePoseTarget, GenSkeFeat, + JointToBone, MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, PoseCompact, + PoseDecode, PreNormalize2D, PreNormalize3D, + ToMotion, UniformSampleFrames) +from .processing import (CenterCrop, ColorJitter, Flip, Fuse, MultiScaleCrop, + RandomCrop, RandomRescale, RandomResizedCrop, Resize, + TenCrop, ThreeCrop) +from .text_transforms import CLIPTokenize +from .wrappers import ImgAug, PytorchVideoWrapper, TorchVisionWrapper + +__all__ = [ + 'ArrayDecode', 'AudioFeatureSelector', 'BuildPseudoClip', 'CenterCrop', + 'ColorJitter', 'DecordDecode', 'DecordInit', 'DecordInit', + 'DenseSampleFrames', 'Flip', 'FormatAudioShape', 'FormatGCNInput', + 'FormatShape', 'Fuse', 'GenSkeFeat', 'GenerateLocalizationLabels', + 'GeneratePoseTarget', 'ImageDecode', 'ImgAug', 'JointToBone', + 'LoadAudioFeature', 'LoadHVULabel', 'DecompressPose', + 'LoadLocalizationFeature', 'LoadProposals', 'LoadRGBFromFile', + 'MergeSkeFeat', 'MultiScaleCrop', 'OpenCVDecode', 'OpenCVInit', + 'OpenCVInit', 'PIMSDecode', 'PIMSInit', 'PackActionInputs', + 'PackLocalizationInputs', 'PadTo', 'PoseCompact', 'PoseDecode', + 'PreNormalize2D', 'PreNormalize3D', 'PyAVDecode', 'PyAVDecodeMotionVector', + 'PyAVInit', 'PyAVInit', 'PytorchVideoWrapper', 'RandomCrop', + 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode', 'Resize', + 'SampleAVAFrames', 
'SampleFrames', 'TenCrop', 'ThreeCrop', 'ToMotion', + 'TorchVisionWrapper', 'Transpose', 'UniformSample', 'UniformSampleFrames', + 'UntrimmedSampleFrames', 'MMUniformSampleFrames', 'MMDecode', 'MMCompact', + 'CLIPTokenize' +] diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..7616defe89bb7d6d84cc4a4b6e4631043c8d4335 --- /dev/null +++ b/mmaction/datasets/transforms/formatting.py @@ -0,0 +1,451 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence, Tuple + +import numpy as np +import torch +from mmcv.transforms import BaseTransform, to_tensor +from mmengine.structures import InstanceData + +from mmaction.registry import TRANSFORMS +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class PackActionInputs(BaseTransform): + """Pack the inputs data. + + Args: + collect_keys (tuple[str], optional): The keys to be collected + to ``packed_results['inputs']``. Defaults to `` + meta_keys (Sequence[str]): The meta keys to saved in the + `metainfo` of the `data_sample`. + Defaults to ``('img_shape', 'img_key', 'video_id', 'timestamp')``. + algorithm_keys (Sequence[str]): The keys of custom elements to be used + in the algorithm. Defaults to an empty tuple. + """ + + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_labels': 'labels', + } + + def __init__( + self, + collect_keys: Optional[Tuple[str]] = None, + meta_keys: Sequence[str] = ('img_shape', 'img_key', 'video_id', + 'timestamp'), + algorithm_keys: Sequence[str] = (), + ) -> None: + self.collect_keys = collect_keys + self.meta_keys = meta_keys + self.algorithm_keys = algorithm_keys + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PackActionInputs`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + packed_results = dict() + if self.collect_keys is not None: + packed_results['inputs'] = dict() + for key in self.collect_keys: + packed_results['inputs'][key] = to_tensor(results[key]) + else: + if 'imgs' in results: + imgs = results['imgs'] + packed_results['inputs'] = to_tensor(imgs) + elif 'heatmap_imgs' in results: + heatmap_imgs = results['heatmap_imgs'] + packed_results['inputs'] = to_tensor(heatmap_imgs) + elif 'keypoint' in results: + keypoint = results['keypoint'] + packed_results['inputs'] = to_tensor(keypoint) + elif 'audios' in results: + audios = results['audios'] + packed_results['inputs'] = to_tensor(audios) + elif 'text' in results: + text = results['text'] + packed_results['inputs'] = to_tensor(text) + else: + raise ValueError( + 'Cannot get `imgs`, `keypoint`, `heatmap_imgs`, ' + '`audios` or `text` in the input dict of ' + '`PackActionInputs`.') + + data_sample = ActionDataSample() + + if 'gt_bboxes' in results: + instance_data = InstanceData() + for key in self.mapping_table.keys(): + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + + if 'proposals' in results: + data_sample.proposals = InstanceData( + bboxes=to_tensor(results['proposals'])) + + if 'label' in results: + data_sample.set_gt_label(results['label']) + + # Set custom algorithm keys + for key in self.algorithm_keys: + if key in results: + data_sample.set_field(results[key], key) + + # Set meta keys + img_meta = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(collect_keys={self.collect_keys}, ' + repr_str += f'meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class PackLocalizationInputs(BaseTransform): + + def __init__(self, keys=(), meta_keys=('video_name', )): + self.keys = keys + 
self.meta_keys = meta_keys + + def transform(self, results): + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_samples' (obj:`DetDataSample`): The annotation info of the + sample. + """ + packed_results = dict() + if 'raw_feature' in results: + raw_feature = results['raw_feature'] + packed_results['inputs'] = to_tensor(raw_feature) + elif 'bsp_feature' in results: + packed_results['inputs'] = torch.tensor(0.) + else: + raise ValueError( + 'Cannot get "raw_feature" or "bsp_feature" in the input ' + 'dict of `PackActionInputs`.') + + data_sample = ActionDataSample() + for key in self.keys: + if key not in results: + continue + elif key == 'proposals': + instance_data = InstanceData() + instance_data[key] = to_tensor(results[key]) + data_sample.proposals = instance_data + else: + if hasattr(data_sample, 'gt_instances'): + data_sample.gt_instances[key] = to_tensor(results[key]) + else: + instance_data = InstanceData() + instance_data[key] = to_tensor(results[key]) + data_sample.gt_instances = instance_data + + img_meta = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class Transpose(BaseTransform): + """Transpose image channels to a given order. + + Args: + keys (Sequence[str]): Required keys to be converted. + order (Sequence[int]): Image channel order. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def transform(self, results): + """Performs the Transpose formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, order={self.order})') + + +@TRANSFORMS.register_module() +class FormatShape(BaseTransform): + """Format final imgs shape to the given input_format. + + Required keys: + + - imgs (optional) + - heatmap_imgs (optional) + - modality (optional) + - num_clips + - clip_len + + Modified Keys: + + - imgs + + Added Keys: + + - input_shape + - heatmap_input_shape (optional) + + Args: + input_format (str): Define the final data format. + collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW, + etc.) if N is 1. Should be set as True when training and testing + detectors. Defaults to False. + """ + + def __init__(self, input_format: str, collapse: bool = False) -> None: + self.input_format = input_format + self.collapse = collapse + if self.input_format not in [ + 'NCTHW', 'NCHW', 'NCTHW_Heatmap', 'NPTCHW' + ]: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def transform(self, results: Dict) -> Dict: + """Performs the FormatShape formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + if not isinstance(results['imgs'], np.ndarray): + results['imgs'] = np.array(results['imgs']) + + # [M x H x W x C] + # M = 1 * N_crops * N_clips * T + if self.collapse: + assert results['num_clips'] == 1 + + if self.input_format == 'NCTHW': + if 'imgs' in results: + imgs = results['imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + if isinstance(clip_len, dict): + clip_len = clip_len['RGB'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x H x W x C + imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if 'heatmap_imgs' in results: + imgs = results['heatmap_imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + # clip_len must be a dict + clip_len = clip_len['Pose'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['heatmap_imgs'] = imgs + results['heatmap_input_shape'] = imgs.shape + + elif self.input_format == 'NCTHW_Heatmap': + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + elif self.input_format == 'NCHW': + imgs = results['imgs'] + imgs = np.transpose(imgs, (0, 3, 1, 2)) + if 'modality' in results and results['modality'] == 
'Flow': + clip_len = results['clip_len'] + imgs = imgs.reshape((-1, clip_len * imgs.shape[1]) + + imgs.shape[2:]) + # M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + elif self.input_format == 'NPTCHW': + num_proposals = results['num_proposals'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + imgs = imgs.reshape((num_proposals, num_clips * clip_len) + + imgs.shape[1:]) + # P x M x H x W x C + # M = N_clips x T + imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) + # P x M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if self.collapse: + assert results['imgs'].shape[0] == 1 + results['imgs'] = results['imgs'].squeeze(0) + results['input_shape'] = results['imgs'].shape + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f"(input_format='{self.input_format}')" + return repr_str + + +@TRANSFORMS.register_module() +class FormatAudioShape(BaseTransform): + """Format final audio shape to the given input_format. + + Required Keys: + + - audios + + Modified Keys: + + - audios + + Added Keys: + + - input_shape + + Args: + input_format (str): Define the final imgs format. + """ + + def __init__(self, input_format: str) -> None: + self.input_format = input_format + if self.input_format not in ['NCTF']: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def transform(self, results: Dict) -> Dict: + """Performs the FormatShape formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + audios = results['audios'] + # clip x sample x freq -> clip x channel x sample x freq + clip, sample, freq = audios.shape + audios = audios.reshape(clip, 1, sample, freq) + results['audios'] = audios + results['input_shape'] = audios.shape + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f"(input_format='{self.input_format}')" + return repr_str + + +@TRANSFORMS.register_module() +class FormatGCNInput(BaseTransform): + """Format final skeleton shape. + + Required Keys: + + - keypoint + - keypoint_score (optional) + - num_clips (optional) + + Modified Key: + + - keypoint + + Args: + num_person (int): The maximum number of people. Defaults to 2. + mode (str): The padding mode. Defaults to ``'zero'``. + """ + + def __init__(self, num_person: int = 2, mode: str = 'zero') -> None: + self.num_person = num_person + assert mode in ['zero', 'loop'] + self.mode = mode + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`FormatGCNInput`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + keypoint = results['keypoint'] + if 'keypoint_score' in results: + keypoint = np.concatenate( + (keypoint, results['keypoint_score'][..., None]), axis=-1) + + cur_num_person = keypoint.shape[0] + if cur_num_person < self.num_person: + pad_dim = self.num_person - cur_num_person + pad = np.zeros( + (pad_dim, ) + keypoint.shape[1:], dtype=keypoint.dtype) + keypoint = np.concatenate((keypoint, pad), axis=0) + if self.mode == 'loop' and cur_num_person == 1: + for i in range(1, self.num_person): + keypoint[i] = keypoint[0] + + elif cur_num_person > self.num_person: + keypoint = keypoint[:self.num_person] + + M, T, V, C = keypoint.shape + nc = results.get('num_clips', 1) + assert T % nc == 0 + keypoint = keypoint.reshape( + (M, nc, T // nc, V, C)).transpose(1, 0, 2, 3, 4) + + results['keypoint'] = np.ascontiguousarray(keypoint) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'num_person={self.num_person}, ' + f'mode={self.mode})') + return repr_str diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..2bf349bdf5f22f6a99224e05253d6ee855d41ee6 --- /dev/null +++ b/mmaction/datasets/transforms/loading.py @@ -0,0 +1,1929 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import io +import os +import os.path as osp +import shutil +from typing import Dict, List, Optional, Union + +import mmcv +import numpy as np +import torch +from mmcv.transforms import BaseTransform +from mmengine.fileio import FileClient + +from mmaction.registry import TRANSFORMS +from mmaction.utils import get_random_string, get_shm_dir, get_thread_id + + +@TRANSFORMS.register_module() +class LoadRGBFromFile(BaseTransform): + """Load a RGB image from file. 
+ + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:``mmcv.imfrombytes``. + Defaults to 'color'. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :func:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + io_backend (str): io backend where frames are store. + Default: 'disk'. + ignore_empty (bool): Whether to allow loading empty image or file path + not existent. Defaults to False. + kwargs (dict): Args for file client. + """ + + def __init__(self, + to_float32: bool = False, + color_type: str = 'color', + imdecode_backend: str = 'cv2', + io_backend: str = 'disk', + ignore_empty: bool = False, + **kwargs) -> None: + self.ignore_empty = ignore_empty + self.to_float32 = to_float32 + self.color_type = color_type + self.imdecode_backend = imdecode_backend + self.file_client = FileClient(io_backend, **kwargs) + self.io_backend = io_backend + + def transform(self, results: dict) -> dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. 
+ """ + + filename = results['img_path'] + try: + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes( + img_bytes, + flag=self.color_type, + channel_order='rgb', + backend=self.imdecode_backend) + except Exception as e: + if self.ignore_empty: + return None + else: + raise e + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'ignore_empty={self.ignore_empty}, ' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"imdecode_backend='{self.imdecode_backend}', " + f"io_backend='{self.io_backend}')") + return repr_str + + +@TRANSFORMS.register_module() +class LoadHVULabel(BaseTransform): + """Convert the HVU label from dictionaries to torch tensors. + + Required keys are "label", "categories", "category_nums", added or modified + keys are "label", "mask" and "category_mask". + """ + + def __init__(self, **kwargs): + self.hvu_initialized = False + self.kwargs = kwargs + + def init_hvu_info(self, categories, category_nums): + """Initialize hvu information.""" + assert len(categories) == len(category_nums) + self.categories = categories + self.category_nums = category_nums + self.num_categories = len(self.categories) + self.num_tags = sum(self.category_nums) + self.category2num = dict(zip(categories, category_nums)) + self.start_idx = [0] + for i in range(self.num_categories - 1): + self.start_idx.append(self.start_idx[-1] + self.category_nums[i]) + self.category2startidx = dict(zip(categories, self.start_idx)) + self.hvu_initialized = True + + def transform(self, results): + """Convert the label dictionary to 3 tensors: "label", "mask" and + "category_mask". + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + + if not self.hvu_initialized: + self.init_hvu_info(results['categories'], results['category_nums']) + + onehot = torch.zeros(self.num_tags) + onehot_mask = torch.zeros(self.num_tags) + category_mask = torch.zeros(self.num_categories) + + for category, tags in results['label'].items(): + # skip if not training on this category + if category not in self.categories: + continue + category_mask[self.categories.index(category)] = 1. + start_idx = self.category2startidx[category] + category_num = self.category2num[category] + tags = [idx + start_idx for idx in tags] + onehot[tags] = 1. + onehot_mask[start_idx:category_num + start_idx] = 1. + + results['label'] = onehot + results['mask'] = onehot_mask + results['category_mask'] = category_mask + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'hvu_initialized={self.hvu_initialized})') + return repr_str + + +@TRANSFORMS.register_module() +class SampleFrames(BaseTransform): + """Sample frames from the video. + + Required Keys: + + - total_frames + - start_index + + Added Keys: + + - frame_inds + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. + num_clips (int): Number of clips to be sampled. Default: 1. + temporal_jitter (bool): Whether to apply temporal jittering. + Defaults to False. + twice_sample (bool): Whether to use twice sample when testing. + If set to True, it will sample frames with and without fixed shift, + which is commonly used for testing in TSM model. Defaults to False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_last'. + Defaults to 'loop'. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + keep_tail_frames (bool): Whether to keep tail frames when sampling. + Defaults to False. 
+ target_fps (optional, int): Convert input videos with arbitrary frame + rates to the unified target FPS before sampling frames. If + ``None``, the frame rate will not be adjusted. Defaults to + ``None``. + """ + + def __init__(self, + clip_len: int, + frame_interval: int = 1, + num_clips: int = 1, + temporal_jitter: bool = False, + twice_sample: bool = False, + out_of_bound_opt: str = 'loop', + test_mode: bool = False, + keep_tail_frames: bool = False, + target_fps: Optional[int] = None, + **kwargs) -> None: + + self.clip_len = clip_len + self.frame_interval = frame_interval + self.num_clips = num_clips + self.temporal_jitter = temporal_jitter + self.twice_sample = twice_sample + self.out_of_bound_opt = out_of_bound_opt + self.test_mode = test_mode + self.keep_tail_frames = keep_tail_frames + self.target_fps = target_fps + assert self.out_of_bound_opt in ['loop', 'repeat_last'] + + def _get_train_clips(self, num_frames: int, + ori_clip_len: float) -> np.array: + """Get clip offsets in train mode. + + It will calculate the average interval for selected frames, + and randomly shift them within offsets between [0, avg_interval]. + If the total number of frames is smaller than clips num or origin + frames length, it will return all zero indices. + + Args: + num_frames (int): Total number of frame in the video. + ori_clip_len (float): length of original sample clip. + + Returns: + np.ndarray: Sampled frame indices in train mode. 
+ """ + + if self.keep_tail_frames: + avg_interval = (num_frames - ori_clip_len + 1) / float( + self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + np.random.uniform( + 0, avg_interval, self.num_clips)).astype(np.int32) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int32) + else: + avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips + + if avg_interval > 0: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + np.random.randint( + avg_interval, size=self.num_clips) + elif num_frames > max(self.num_clips, ori_clip_len): + clip_offsets = np.sort( + np.random.randint( + num_frames - ori_clip_len + 1, size=self.num_clips)) + elif avg_interval == 0: + ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips + clip_offsets = np.around(np.arange(self.num_clips) * ratio) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int32) + + return clip_offsets + + def _get_test_clips(self, num_frames: int, + ori_clip_len: float) -> np.array: + """Get clip offsets in test mode. + + If the total number of frames is + not enough, it will return all zero indices. + + Args: + num_frames (int): Total number of frame in the video. + ori_clip_len (float): length of original sample clip. + + Returns: + np.ndarray: Sampled frame indices in test mode. 
+ """ + if self.clip_len == 1: # 2D recognizer + # assert self.frame_interval == 1 + avg_interval = num_frames / float(self.num_clips) + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + avg_interval / 2.0 + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: # 3D recognizer + max_offset = max(num_frames - ori_clip_len, 0) + if self.twice_sample: + num_clips = self.num_clips * 2 + else: + num_clips = self.num_clips + if num_clips > 1: + num_segments = self.num_clips - 1 + # align test sample strategy with `PySlowFast` repo + if self.target_fps is not None: + offset_between = np.floor(max_offset / float(num_segments)) + clip_offsets = np.arange(num_clips) * offset_between + else: + offset_between = max_offset / float(num_segments) + clip_offsets = np.arange(num_clips) * offset_between + clip_offsets = np.round(clip_offsets) + else: + clip_offsets = np.array([max_offset // 2]) + return clip_offsets + + def _sample_clips(self, num_frames: int, ori_clip_len: float) -> np.array: + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames, ori_clip_len) + else: + clip_offsets = self._get_train_clips(num_frames, ori_clip_len) + + return clip_offsets + + def _get_ori_clip_len(self, fps_scale_ratio: float) -> float: + """calculate length of clip segment for different strategy. + + Args: + fps_scale_ratio (float): Scale ratio to adjust fps. 
+ """ + if self.target_fps is not None: + # align test sample strategy with `PySlowFast` repo + ori_clip_len = self.clip_len * self.frame_interval + ori_clip_len = np.maximum(1, ori_clip_len * fps_scale_ratio) + elif self.test_mode: + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + else: + ori_clip_len = self.clip_len * self.frame_interval + + return ori_clip_len + + def transform(self, results: dict) -> dict: + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + total_frames = results['total_frames'] + # if can't get fps, same value of `fps` and `target_fps` + # will perform nothing + fps = results.get('avg_fps') + if self.target_fps is None or not fps: + fps_scale_ratio = 1.0 + else: + fps_scale_ratio = fps / self.target_fps + ori_clip_len = self._get_ori_clip_len(fps_scale_ratio) + clip_offsets = self._sample_clips(total_frames, ori_clip_len) + + if self.target_fps: + frame_inds = clip_offsets[:, None] + np.linspace( + 0, ori_clip_len - 1, self.clip_len).astype(np.int32) + else: + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = 
frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class UniformSample(BaseTransform): + """Uniformly sample frames from the video. + + Modified from https://github.com/facebookresearch/SlowFast/blob/64a + bcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. + + To sample an n-frame clip from the video. UniformSample basically + divides the video into n segments of equal length and randomly samples one + frame from each segment. + + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Defaults to 1. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + clip_len: int, + num_clips: int = 1, + test_mode: bool = False) -> None: + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def _get_sample_clips(self, num_frames: int) -> np.ndarray: + """To sample an n-frame clip from the video. UniformSample basically + divides the video into n segments of equal length and randomly samples + one frame from each segment. When the duration of video frames is + shorter than the desired length of the target clip, this approach will + duplicate the sampled frame instead of looping the sample in "loop" + mode. 
In the test mode, when we need to sample multiple clips, + specifically 'n' clips, this method will further divide the segments + based on the number of clips to be sampled. The 'i-th' clip will. + + sample the frame located at the position 'i * len(segment) / n' + within the segment. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + seq (np.ndarray): the indexes of frames of sampled from the video. + """ + seg_size = float(num_frames - 1) / self.clip_len + inds = [] + if not self.test_mode: + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + end = int(np.round(seg_size * (i + 1))) + inds.append(np.random.randint(start, end + 1)) + else: + duration = seg_size / (self.num_clips + 1) + for k in range(self.num_clips): + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + frame_index = start + int(duration * (k + 1)) + inds.append(frame_index) + + return np.array(inds) + + def transform(self, results: Dict) -> Dict: + """Perform the Uniform Sampling. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + num_frames = results['total_frames'] + + inds = self._get_sample_clips(num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}') + return repr_str + + +@TRANSFORMS.register_module() +class UntrimmedSampleFrames(BaseTransform): + """Sample frames from the untrimmed video. + + Required keys are "filename", "total_frames", added or modified keys are + "frame_inds", "clip_interval" and "num_clips". + + Args: + clip_len (int): The length of sampled clips. Defaults to 1. 
+ clip_interval (int): Clip interval of adjacent center of sampled + clips. Defaults to 16. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. + """ + + def __init__(self, clip_len=1, clip_interval=16, frame_interval=1): + self.clip_len = clip_len + self.clip_interval = clip_interval + self.frame_interval = frame_interval + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + total_frames = results['total_frames'] + start_index = results['start_index'] + + clip_centers = np.arange(self.clip_interval // 2, total_frames, + self.clip_interval) + num_clips = clip_centers.shape[0] + frame_inds = clip_centers[:, None] + np.arange( + -(self.clip_len // 2 * self.frame_interval), + self.frame_interval * + (self.clip_len - + (self.clip_len // 2)), self.frame_interval)[None, :] + # clip frame_inds to legal range + frame_inds = np.clip(frame_inds, 0, total_frames - 1) + + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['clip_interval'] = self.clip_interval + results['frame_interval'] = self.frame_interval + results['num_clips'] = num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'clip_interval={self.clip_interval}, ' + f'frame_interval={self.frame_interval})') + return repr_str + + +@TRANSFORMS.register_module() +class DenseSampleFrames(SampleFrames): + """Select frames from the video by dense sample strategy. + + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips + + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Defaults to 1. 
+ num_clips (int): Number of clips to be sampled. Defaults to 1. + sample_range (int): Total sample range for dense sample. + Defaults to 64. + num_sample_positions (int): Number of sample start positions, Which is + only used in test mode. Defaults to 10. That is to say, by default, + there are at least 10 clips for one input sample in test mode. + temporal_jitter (bool): Whether to apply temporal jittering. + Defaults to False. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + """ + + def __init__(self, + *args, + sample_range: int = 64, + num_sample_positions: int = 10, + **kwargs): + super().__init__(*args, **kwargs) + self.sample_range = sample_range + self.num_sample_positions = num_sample_positions + + def _get_train_clips(self, num_frames: int) -> np.array: + """Get clip offsets by dense sample strategy in train mode. + + It will calculate a sample position and sample interval and set + start index 0 when sample_pos == 1 or randomly choose from + [0, sample_pos - 1]. Then it will shift the start index by each + base offset. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices in train mode. + """ + sample_position = max(1, 1 + num_frames - self.sample_range) + interval = self.sample_range // self.num_clips + start_idx = 0 if sample_position == 1 else np.random.randint( + 0, sample_position - 1) + base_offsets = np.arange(self.num_clips) * interval + clip_offsets = (base_offsets + start_idx) % num_frames + return clip_offsets + + def _get_test_clips(self, num_frames: int) -> np.array: + """Get clip offsets by dense sample strategy in test mode. + + It will calculate a sample position and sample interval and evenly + sample several start indexes as start positions between + [0, sample_position-1]. Then it will shift each start index by the + base offsets. + + Args: + num_frames (int): Total number of frame in the video. 
+ + Returns: + np.ndarray: Sampled frame indices in train mode. + """ + sample_position = max(1, 1 + num_frames - self.sample_range) + interval = self.sample_range // self.num_clips + start_list = np.linspace( + 0, sample_position - 1, num=self.num_sample_positions, dtype=int) + base_offsets = np.arange(self.num_clips) * interval + clip_offsets = list() + for start_idx in start_list: + clip_offsets.extend((base_offsets + start_idx) % num_frames) + clip_offsets = np.array(clip_offsets) + return clip_offsets + + def _sample_clips(self, num_frames: int) -> np.array: + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + + return clip_offsets + + def transform(self, results: dict) -> dict: + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + total_frames = results['total_frames'] + + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'sample_range={self.sample_range}, ' + f'num_sample_positions={self.num_sample_positions}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + """Get clip offsets.""" + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * 
self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + if not self.test_mode: + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + return frame_inds + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + start_index = results.get('start_index', 0) + if results.get('total_frames') is not None: + shot_info = (0, results['total_frames']) + else: + shot_info = results['shot_info'] + + center_index = fps * (timestamp - timestamp_start) + start_index + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + + frame_inds = np.array(frame_inds, dtype=np.int32) + start_index + results['frame_inds'] = frame_inds + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@TRANSFORMS.register_module() +class PyAVInit(BaseTransform): + """Using pyav to initialize the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "filename", + added or modified keys are "video_reader", and "total_frames". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + kwargs (dict): Args for file client. 
+ """ + + def __init__(self, io_backend='disk', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results): + """Perform the PyAV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import av + except ImportError: + raise ImportError('Please run "conda install av -c conda-forge" ' + 'or "pip install av" to install PyAV first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + container = av.open(file_obj) + + results['video_reader'] = container + results['total_frames'] = container.streams.video[0].frames + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(io_backend={self.io_backend})' + return repr_str + + +@TRANSFORMS.register_module() +class PyAVDecode(BaseTransform): + """Using PyAV to decode the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + + Args: + multi_thread (bool): If set to True, it will apply multi + thread processing. Default: False. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + the nearest key frames, which may be duplicated and inaccurate, + and more suitable for large scene-based video datasets. + Default: 'accurate'. 
+ """ + + def __init__(self, multi_thread=False, mode='accurate'): + self.multi_thread = multi_thread + self.mode = mode + assert mode in ['accurate', 'efficient'] + + @staticmethod + def frame_generator(container, stream): + """Frame generator for PyAV.""" + for packet in container.demux(stream): + for frame in packet.decode(): + if frame: + return frame.to_rgb().to_ndarray() + + def transform(self, results): + """Perform the PyAV decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + if self.mode == 'accurate': + # set max indice to make early stop + max_inds = max(results['frame_inds']) + i = 0 + for frame in container.decode(video=0): + if i > max_inds + 1: + break + imgs.append(frame.to_rgb().to_ndarray()) + i += 1 + + # the available frame in pyav may be less than its length, + # which may raise error + results['imgs'] = [ + imgs[i % len(imgs)] for i in results['frame_inds'] + ] + elif self.mode == 'efficient': + for frame in container.decode(video=0): + backup_frame = frame + break + stream = container.streams.video[0] + for idx in results['frame_inds']: + pts_scale = stream.average_rate * stream.time_base + frame_pts = int(idx / pts_scale) + container.seek( + frame_pts, any_frame=False, backward=True, stream=stream) + frame = self.frame_generator(container, stream) + if frame is not None: + imgs.append(frame) + backup_frame = frame + else: + imgs.append(backup_frame) + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + results['video_reader'] = None + del container + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(multi_thread={self.multi_thread}, 
mode={self.mode})' + return repr_str + + +@TRANSFORMS.register_module() +class PIMSInit(BaseTransform): + """Use PIMS to initialize the video. + + PIMS: https://github.com/soft-matter/pims + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will always use ``pims.PyAVReaderIndexed`` + to decode videos into accurate frames. If set to 'efficient', it + will adopt fast seeking by using ``pims.PyAVReaderTimed``. + Both will return the accurate frames in most cases. + Default: 'accurate'. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', mode='accurate', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def transform(self, results): + """Perform the PIMS initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import pims + except ImportError: + raise ImportError('Please run "conda install pims -c conda-forge" ' + 'or "pip install pims" to install pims first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + if self.mode == 'accurate': + container = pims.PyAVReaderIndexed(file_obj) + else: + container = pims.PyAVReaderTimed(file_obj) + + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(io_backend={self.io_backend}, ' + f'mode={self.mode})') + return repr_str + + +@TRANSFORMS.register_module() +class PIMSDecode(BaseTransform): + """Using PIMS to decode the videos. 
+ + PIMS: https://github.com/soft-matter/pims + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + """ + + def transform(self, results): + """Perform the PIMS decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = [container[idx] for idx in frame_inds] + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@TRANSFORMS.register_module() +class PyAVDecodeMotionVector(PyAVDecode): + """Using pyav to decode the motion vectors from video. + + Reference: https://github.com/PyAV-Org/PyAV/ + blob/main/tests/test_decode.py + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "motion_vectors", "frame_inds". + """ + + @staticmethod + def _parse_vectors(mv, vectors, height, width): + """Parse the returned vectors.""" + (w, h, src_x, src_y, dst_x, + dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], + vectors['src_y'], vectors['dst_x'], vectors['dst_y']) + val_x = dst_x - src_x + val_y = dst_y - src_y + start_x = dst_x - w // 2 + start_y = dst_y - h // 2 + end_x = start_x + w + end_y = start_y + h + for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y, + val_x, val_y): + if (sx >= 0 and ex < width and sy >= 0 and ey < height): + mv[sy:ey, sx:ex] = (vx, vy) + + return mv + + def transform(self, results): + """Perform the PyAV motion vector decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + # set max index to make early stop + max_idx = max(results['frame_inds']) + i = 0 + stream = container.streams.video[0] + codec_context = stream.codec_context + codec_context.options = {'flags2': '+export_mvs'} + for packet in container.demux(stream): + for frame in packet.decode(): + if i > max_idx + 1: + break + i += 1 + height = frame.height + width = frame.width + mv = np.zeros((height, width, 2), dtype=np.int8) + vectors = frame.side_data.get('MOTION_VECTORS') + if frame.key_frame: + # Key frame don't have motion vectors + assert vectors is None + if vectors is not None and len(vectors) > 0: + mv = self._parse_vectors(mv, vectors.to_ndarray(), height, + width) + imgs.append(mv) + + results['video_reader'] = None + del container + + # the available frame in pyav may be less than its length, + # which may raise error + results['motion_vectors'] = np.array( + [imgs[i % len(imgs)] for i in results['frame_inds']]) + return results + + +@TRANSFORMS.register_module() +class DecordInit(BaseTransform): + """Using decord to initialize the video_reader. + + Decord: https://github.com/dmlc/decord + + Required Keys: + + - filename + + Added Keys: + + - video_reader + - total_frames + - fps + + Args: + io_backend (str): io backend where frames are store. + Defaults to ``'disk'``. + num_threads (int): Number of thread to decode the video. Defaults to 1. + kwargs (dict): Args for file client. 
+ """ + + def __init__(self, + io_backend: str = 'disk', + num_threads: int = 1, + **kwargs) -> None: + self.io_backend = io_backend + self.num_threads = num_threads + self.kwargs = kwargs + self.file_client = None + + def _get_video_reader(self, filename: str) -> object: + if osp.splitext(filename)[0] == filename: + filename = filename + '.mp4' + try: + import decord + except ImportError: + raise ImportError( + 'Please run "pip install decord" to install Decord first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + file_obj = io.BytesIO(self.file_client.get(filename)) + container = decord.VideoReader(file_obj, num_threads=self.num_threads) + return container + + def transform(self, results: Dict) -> Dict: + """Perform the Decord initialization. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = self._get_video_reader(results['filename']) + results['total_frames'] = len(container) + + results['video_reader'] = container + results['avg_fps'] = container.get_avg_fps() + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'num_threads={self.num_threads})') + return repr_str + + +@TRANSFORMS.register_module() +class DecordDecode(BaseTransform): + """Using decord to decode the video. + + Decord: https://github.com/dmlc/decord + + Required Keys: + + - video_reader + - frame_inds + + Added Keys: + + - imgs + - original_shape + - img_shape + + Args: + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + key frames, which may be duplicated and inaccurate, and more + suitable for large scene-based video datasets. + Defaults to ``'accurate'``. 
+ """ + + def __init__(self, mode: str = 'accurate') -> None: + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def _decord_load_frames(self, container: object, + frame_inds: np.ndarray) -> List[np.ndarray]: + if self.mode == 'accurate': + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + elif self.mode == 'efficient': + # This mode is faster, however it always returns I-FRAME + container.seek(0) + imgs = list() + for idx in frame_inds: + container.seek(idx) + frame = container.next() + imgs.append(frame.asnumpy()) + return imgs + + def transform(self, results: Dict) -> Dict: + """Perform the Decord decoding. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = self._decord_load_frames(container, frame_inds) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + if 'gt_bboxes' in results: + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = results['gt_bboxes'] + gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + + return results + + def __repr__(self) -> str: + repr_str = f'{self.__class__.__name__}(mode={self.mode})' + return repr_str + + +@TRANSFORMS.register_module() +class OpenCVInit(BaseTransform): + """Using OpenCV to initialize the video_reader. 
+ + Required keys are ``'filename'``, added or modified keys are ` + `'new_path'``, ``'video_reader'`` and ``'total_frames'``. + + Args: + io_backend (str): io backend where frames are store. + Defaults to ``'disk'``. + """ + + def __init__(self, io_backend: str = 'disk', **kwargs) -> None: + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.tmp_folder = None + if self.io_backend != 'disk': + random_string = get_random_string() + thread_id = get_thread_id() + self.tmp_folder = osp.join(get_shm_dir(), + f'{random_string}_{thread_id}') + os.mkdir(self.tmp_folder) + + def transform(self, results: dict) -> dict: + """Perform the OpenCV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + if self.io_backend == 'disk': + new_path = results['filename'] + else: + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + thread_id = get_thread_id() + # save the file of same thread at the same place + new_path = osp.join(self.tmp_folder, f'tmp_{thread_id}.mp4') + with open(new_path, 'wb') as f: + f.write(self.file_client.get(results['filename'])) + + container = mmcv.VideoReader(new_path) + results['new_path'] = new_path + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __del__(self): + if self.tmp_folder and osp.exists(self.tmp_folder): + shutil.rmtree(self.tmp_folder) + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class OpenCVDecode(BaseTransform): + """Using OpenCV to decode the video. + + Required keys are ``'video_reader'``, ``'filename'`` and ``'frame_inds'``, + added or modified keys are ``'imgs'``, ``'img_shape'`` and + ``'original_shape'``. + """ + + def transform(self, results: dict) -> dict: + """Perform the OpenCV decoding. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + for frame_ind in results['frame_inds']: + cur_frame = container[frame_ind] + # last frame may be None in OpenCV + while isinstance(cur_frame, type(None)): + frame_ind -= 1 + cur_frame = container[frame_ind] + imgs.append(cur_frame) + + results['video_reader'] = None + del container + + imgs = np.array(imgs) + # The default channel order of OpenCV is BGR, thus we change it to RGB + imgs = imgs[:, :, :, ::-1] + results['imgs'] = list(imgs) + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@TRANSFORMS.register_module() +class RawFrameDecode(BaseTransform): + """Load and decode frames with given indices. + + Required Keys: + + - frame_dir + - filename_tmpl + - frame_inds + - modality + - offset (optional) + + Added Keys: + + - img + - img_shape + - original_shape + + Args: + io_backend (str): IO backend where frames are stored. + Defaults to ``'disk'``. + decoding_backend (str): Backend used for image decoding. + Defaults to ``'cv2'``. + """ + + def __init__(self, + io_backend: str = 'disk', + decoding_backend: str = 'cv2', + **kwargs) -> None: + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results: dict) -> dict: + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + filename_tmpl = results['filename_tmpl'] + modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + cache = {} + for i, frame_idx in enumerate(results['frame_inds']): + # Avoid loading duplicated frames + if frame_idx in cache: + imgs.append(cp.deepcopy(imgs[cache[frame_idx]])) + continue + else: + cache[frame_idx] = i + + frame_idx += offset + if modality == 'RGB': + filepath = osp.join(directory, filename_tmpl.format(frame_idx)) + img_bytes = self.file_client.get(filepath) + # Get frame with channel order RGB directly. + cur_frame = mmcv.imfrombytes(img_bytes, channel_order='rgb') + imgs.append(cur_frame) + elif modality == 'Flow': + x_filepath = osp.join(directory, + filename_tmpl.format('x', frame_idx)) + y_filepath = osp.join(directory, + filename_tmpl.format('y', frame_idx)) + x_img_bytes = self.file_client.get(x_filepath) + x_frame = mmcv.imfrombytes(x_img_bytes, flag='grayscale') + y_img_bytes = self.file_client.get(y_filepath) + y_frame = mmcv.imfrombytes(y_img_bytes, flag='grayscale') + imgs.append(np.stack([x_frame, y_frame], axis=-1)) + else: + raise NotImplementedError + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + if 'gt_bboxes' in results: + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = results['gt_bboxes'] + gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + 
results['proposals'] = proposals + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'decoding_backend={self.decoding_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class InferencerPackInput(BaseTransform): + + def __init__(self, + input_format='video', + filename_tmpl='img_{:05}.jpg', + modality='RGB', + start_index=1) -> None: + self.input_format = input_format + self.filename_tmpl = filename_tmpl + self.modality = modality + self.start_index = start_index + + def transform(self, video: Union[str, np.ndarray, dict]) -> dict: + if self.input_format == 'dict': + results = video + elif self.input_format == 'video': + results = dict( + filename=video, label=-1, start_index=0, modality='RGB') + elif self.input_format == 'rawframes': + import re + + # count the number of frames that match the format of + # `filename_tmpl` + # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$ + # Flow patteren example: {}_{:05d}.jpg -> ^x_\d+.jpg$ + pattern = f'^{self.filename_tmpl}$' + if self.modality == 'Flow': + pattern = pattern.replace('{}', 'x') + pattern = pattern.replace( + pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+') + total_frames = len( + list( + filter(lambda x: re.match(pattern, x) is not None, + os.listdir(video)))) + results = dict( + frame_dir=video, + total_frames=total_frames, + label=-1, + start_index=self.start_index, + filename_tmpl=self.filename_tmpl, + modality=self.modality) + elif self.input_format == 'array': + modality_map = {2: 'Flow', 3: 'RGB'} + modality = modality_map.get(video.shape[-1]) + results = dict( + total_frames=video.shape[0], + label=-1, + start_index=0, + array=video, + modality=modality) + + return results + + +@TRANSFORMS.register_module() +class ArrayDecode(BaseTransform): + """Load and decode frames with given indices from a 4D array. 
+ + Required keys are "array and "frame_inds", added or modified keys are + "imgs", "img_shape" and "original_shape". + """ + + def transform(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + modality = results['modality'] + array = results['array'] + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + for i, frame_idx in enumerate(results['frame_inds']): + + frame_idx += offset + if modality == 'RGB': + imgs.append(array[frame_idx]) + elif modality == 'Flow': + imgs.extend( + [array[frame_idx, ..., 0], array[frame_idx, ..., 1]]) + else: + raise NotImplementedError + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' + + +@TRANSFORMS.register_module() +class ImageDecode(BaseTransform): + """Load and decode images. + + Required key is "filename", added or modified keys are "imgs", "img_shape" + and "original_shape". + + Args: + io_backend (str): IO backend where frames are stored. Default: 'disk'. + decoding_backend (str): Backend used for image decoding. + Default: 'cv2'. + kwargs (dict, optional): Arguments for FileClient. + """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results): + """Perform the ``ImageDecode`` to load image given the file path. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + mmcv.use_backend(self.decoding_backend) + + filename = results['filename'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + img_bytes = self.file_client.get(filename) + + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + imgs.append(img) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadAudioFeature(BaseTransform): + """Load offline extracted audio features. + + Required Keys: + + - audio_path + + Added Keys: + + - length + - audios + + Args: + pad_method (str): Padding method. Defaults to ``'zero'``. + """ + + def __init__(self, pad_method: str = 'zero') -> None: + if pad_method not in ['zero', 'random']: + raise NotImplementedError + self.pad_method = pad_method + + @staticmethod + def _zero_pad(shape: int) -> np.ndarray: + """Zero padding method.""" + return np.zeros(shape, dtype=np.float32) + + @staticmethod + def _random_pad(shape: int) -> np.ndarray: + """Random padding method.""" + # spectrogram is normalized into a distribution of 0~1 + return np.random.rand(shape).astype(np.float32) + + def transform(self, results: Dict) -> Dict: + """Perform the numpy loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + if osp.exists(results['audio_path']): + feature_map = np.load(results['audio_path']) + else: + # Generate a random dummy 10s input + # Some videos do not have audio stream + pad_func = getattr(self, f'_{self.pad_method}_pad') + feature_map = pad_func((640, 80)) + + results['length'] = feature_map.shape[0] + results['audios'] = feature_map + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'pad_method={self.pad_method})') + return repr_str + + +@TRANSFORMS.register_module() +class BuildPseudoClip(BaseTransform): + """Build pseudo clips with one single image by repeating it n times. + + Required key is "imgs", added or modified key is "imgs", "num_clips", + "clip_len". + + Args: + clip_len (int): Frames of the generated pseudo clips. + """ + + def __init__(self, clip_len): + self.clip_len = clip_len + + def transform(self, results): + """Perform the building of pseudo clips. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + # the input should be one single image + assert len(results['imgs']) == 1 + im = results['imgs'][0] + for _ in range(1, self.clip_len): + results['imgs'].append(np.copy(im)) + results['clip_len'] = self.clip_len + results['num_clips'] = 1 + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@TRANSFORMS.register_module() +class AudioFeatureSelector(BaseTransform): + """Sample the audio feature w.r.t. the frames selected. + + Required Keys: + + - audios + - frame_inds + - num_clips + - length + - total_frames + + Modified Keys: + + - audios + + Added Keys: + + - audios_shape + + Args: + fixed_length (int): As the features selected by frames sampled may + not be exactly the same, `fixed_length` will truncate or pad them + into the same size. Defaults to 128. 
+ """ + + def __init__(self, fixed_length: int = 128) -> None: + self.fixed_length = fixed_length + + def transform(self, results: Dict) -> Dict: + """Perform the ``AudioFeatureSelector`` to pick audio feature clips. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + audio = results['audios'] + frame_inds = results['frame_inds'] + num_clips = results['num_clips'] + resampled_clips = list() + + frame_inds = frame_inds.reshape(num_clips, -1) + for clip_idx in range(num_clips): + clip_frame_inds = frame_inds[clip_idx] + start_idx = max( + 0, + int( + round((clip_frame_inds[0] + 1) / results['total_frames'] * + results['length']))) + end_idx = min( + results['length'], + int( + round((clip_frame_inds[-1] + 1) / results['total_frames'] * + results['length']))) + cropped_audio = audio[start_idx:end_idx, :] + if cropped_audio.shape[0] >= self.fixed_length: + truncated_audio = cropped_audio[:self.fixed_length, :] + else: + truncated_audio = np.pad( + cropped_audio, + ((0, self.fixed_length - cropped_audio.shape[0]), (0, 0)), + mode='constant') + + resampled_clips.append(truncated_audio) + results['audios'] = np.array(resampled_clips) + results['audios_shape'] = results['audios'].shape + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadLocalizationFeature(BaseTransform): + """Load Video features for localizer with given video_name list. + + The required key is "feature_path", added or modified keys + are "raw_feature". + + Args: + raw_feature_ext (str): Raw feature file extension. Default: '.csv'. + """ + + def transform(self, results): + """Perform the LoadLocalizationFeature loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + data_path = results['feature_path'] + raw_feature = np.loadtxt( + data_path, dtype=np.float32, delimiter=',', skiprows=1) + + results['raw_feature'] = np.transpose(raw_feature, (1, 0)) + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}' + return repr_str + + +@TRANSFORMS.register_module() +class GenerateLocalizationLabels(BaseTransform): + """Load video label for localizer with given video_name list. + + Required keys are "duration_frame", "duration_second", "feature_frame", + "annotations", added or modified keys are "gt_bbox". + """ + + def transform(self, results): + """Perform the GenerateLocalizationLabels loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_frame = results['duration_frame'] + video_second = results['duration_second'] + feature_frame = results['feature_frame'] + corrected_second = float(feature_frame) / video_frame * video_second + annotations = results['annotations'] + + gt_bbox = [] + + for annotation in annotations: + current_start = max( + min(1, annotation['segment'][0] / corrected_second), 0) + current_end = max( + min(1, annotation['segment'][1] / corrected_second), 0) + gt_bbox.append([current_start, current_end]) + + gt_bbox = np.array(gt_bbox) + results['gt_bbox'] = gt_bbox + return results + + +@TRANSFORMS.register_module() +class LoadProposals(BaseTransform): + """Loading proposals with given proposal results. + + Required keys are "video_name", added or modified keys are 'bsp_feature', + 'tmin', 'tmax', 'tmin_score', 'tmax_score' and 'reference_temporal_iou'. + + Args: + top_k (int): The top k proposals to be loaded. + pgm_proposals_dir (str): Directory to load proposals. + pgm_features_dir (str): Directory to load proposal features. + proposal_ext (str): Proposal file extension. Default: '.csv'. + feature_ext (str): Feature file extension. Default: '.npy'. 
+ """ + + def __init__(self, + top_k, + pgm_proposals_dir, + pgm_features_dir, + proposal_ext='.csv', + feature_ext='.npy'): + self.top_k = top_k + self.pgm_proposals_dir = pgm_proposals_dir + self.pgm_features_dir = pgm_features_dir + valid_proposal_ext = ('.csv', ) + if proposal_ext not in valid_proposal_ext: + raise NotImplementedError + self.proposal_ext = proposal_ext + valid_feature_ext = ('.npy', ) + if feature_ext not in valid_feature_ext: + raise NotImplementedError + self.feature_ext = feature_ext + + def transform(self, results): + """Perform the LoadProposals loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_name = results['video_name'] + proposal_path = osp.join(self.pgm_proposals_dir, + video_name + self.proposal_ext) + if self.proposal_ext == '.csv': + pgm_proposals = np.loadtxt( + proposal_path, dtype=np.float32, delimiter=',', skiprows=1) + + pgm_proposals = np.array(pgm_proposals[:self.top_k]) + tmin = pgm_proposals[:, 0] + tmax = pgm_proposals[:, 1] + tmin_score = pgm_proposals[:, 2] + tmax_score = pgm_proposals[:, 3] + reference_temporal_iou = pgm_proposals[:, 5] + + feature_path = osp.join(self.pgm_features_dir, + video_name + self.feature_ext) + if self.feature_ext == '.npy': + bsp_feature = np.load(feature_path).astype(np.float32) + + bsp_feature = bsp_feature[:self.top_k, :] + results['bsp_feature'] = bsp_feature + results['tmin'] = tmin + results['tmax'] = tmax + results['tmin_score'] = tmin_score + results['tmax_score'] = tmax_score + results['reference_temporal_iou'] = reference_temporal_iou + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'top_k={self.top_k}, ' + f'pgm_proposals_dir={self.pgm_proposals_dir}, ' + f'pgm_features_dir={self.pgm_features_dir}, ' + f'proposal_ext={self.proposal_ext}, ' + f'feature_ext={self.feature_ext})') + return repr_str diff --git a/mmaction/datasets/transforms/pose_transforms.py 
b/mmaction/datasets/transforms/pose_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..25c4ed636fdbe64f638f0096bc648774990b5e6e --- /dev/null +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -0,0 +1,1523 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import scipy +from mmcv.transforms import BaseTransform, KeyMapper +from mmengine.dataset import Compose +from packaging import version as pv +from scipy.stats import mode +from torch.nn.modules.utils import _pair + +from mmaction.registry import TRANSFORMS +from .loading import DecordDecode, DecordInit +from .processing import _combine_quadruple + +if pv.parse(scipy.__version__) < pv.parse('1.11.0'): + get_mode = mode +else: + from functools import partial + get_mode = partial(mode, keepdims=True) + + +@TRANSFORMS.register_module() +class DecompressPose(BaseTransform): + """Load Compressed Pose. + + Required Keys: + + - frame_inds + - total_frames + - keypoint + - anno_inds (optional) + + Modified Keys: + + - keypoint + - frame_inds + + Added Keys: + + - keypoint_score + - num_person + + Args: + squeeze (bool): Whether to remove frames with no human pose. + Defaults to True. + max_person (int): The max number of persons in a frame. Defaults to 10. + """ + + def __init__(self, squeeze: bool = True, max_person: int = 10) -> None: + self.squeeze = squeeze + self.max_person = max_person + + def transform(self, results: Dict) -> Dict: + """Perform the pose decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + required_keys = ['total_frames', 'frame_inds', 'keypoint'] + for k in required_keys: + assert k in results + + total_frames = results['total_frames'] + frame_inds = results.pop('frame_inds') + keypoint = results['keypoint'] + + if 'anno_inds' in results: + frame_inds = frame_inds[results['anno_inds']] + keypoint = keypoint[results['anno_inds']] + + assert np.all(np.diff(frame_inds) >= 0), \ + 'frame_inds should be monotonical increasing' + + def mapinds(inds): + uni = np.unique(inds) + map_ = {x: i for i, x in enumerate(uni)} + inds = [map_[x] for x in inds] + return np.array(inds, dtype=np.int16) + + if self.squeeze: + frame_inds = mapinds(frame_inds) + total_frames = np.max(frame_inds) + 1 + + results['total_frames'] = total_frames + + num_joints = keypoint.shape[1] + num_person = get_mode(frame_inds)[-1][0] + + new_kp = np.zeros([num_person, total_frames, num_joints, 2], + dtype=np.float16) + new_kpscore = np.zeros([num_person, total_frames, num_joints], + dtype=np.float16) + nperson_per_frame = np.zeros([total_frames], dtype=np.int16) + + for frame_ind, kp in zip(frame_inds, keypoint): + person_ind = nperson_per_frame[frame_ind] + new_kp[person_ind, frame_ind] = kp[:, :2] + new_kpscore[person_ind, frame_ind] = kp[:, 2] + nperson_per_frame[frame_ind] += 1 + + if num_person > self.max_person: + for i in range(total_frames): + nperson = nperson_per_frame[i] + val = new_kpscore[:nperson, i] + score_sum = val.sum(-1) + + inds = sorted(range(nperson), key=lambda x: -score_sum[x]) + new_kpscore[:nperson, i] = new_kpscore[inds, i] + new_kp[:nperson, i] = new_kp[inds, i] + num_person = self.max_person + results['num_person'] = num_person + + results['keypoint'] = new_kp[:num_person] + results['keypoint_score'] = new_kpscore[:num_person] + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'squeeze={self.squeeze}, ' + f'max_person={self.max_person})') + return repr_str + + +@TRANSFORMS.register_module() +class 
GeneratePoseTarget(BaseTransform): + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Required Keys: + + - keypoint + - keypoint_score (optional) + - img_shape + + Added Keys: + + - imgs (optional) + - heatmap_imgs (optional) + + Args: + sigma (float): The sigma of the generated gaussian map. + Defaults to 0.6. + use_score (bool): Use the confidence score of keypoints as the maximum + of the gaussian maps. Defaults to True. + with_kp (bool): Generate pseudo heatmaps for keypoints. + Defaults to True. + with_limb (bool): Generate pseudo heatmaps for limbs. At least one of + 'with_kp' and 'with_limb' should be True. Defaults to False. + skeletons (tuple[tuple]): The definition of human skeletons. + Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``, + which is the definition of COCO-17p skeletons. + double (bool): Output both original heatmaps and flipped heatmaps. + Defaults to False. + left_kp (tuple[int]): Indexes of left keypoints, which is used when + flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15), + which is left keypoints in COCO-17p. + right_kp (tuple[int]): Indexes of right keypoints, which is used when + flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16), + which is right keypoints in COCO-17p. + left_limb (tuple[int]): Indexes of left limbs, which is used when + flipping heatmaps. Defaults to (0, 2, 4, 5, 6, 10, 11, 12), + which is left limbs of skeletons we defined for COCO-17p. + right_limb (tuple[int]): Indexes of right limbs, which is used when + flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15), + which is right limbs of skeletons we defined for COCO-17p. + scaling (float): The ratio to scale the heatmaps. Defaults to 1. 
+ """ + + def __init__(self, + sigma: float = 0.6, + use_score: bool = True, + with_kp: bool = True, + with_limb: bool = False, + skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3), + (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), + (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), + (14, 16), (11, 12)), + double: bool = False, + left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15), + right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16), + left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12), + right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15), + scaling: float = 1.) -> None: + + self.sigma = sigma + self.use_score = use_score + self.with_kp = with_kp + self.with_limb = with_limb + self.double = double + + # an auxiliary const + self.eps = 1e-4 + + assert self.with_kp or self.with_limb, ( + 'At least one of "with_limb" ' + 'and "with_kp" should be set as True.') + self.left_kp = left_kp + self.right_kp = right_kp + self.skeletons = skeletons + self.left_limb = left_limb + self.right_limb = right_limb + self.scaling = scaling + + def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray, + max_values: np.ndarray) -> None: + """Generate pseudo heatmap for one keypoint in one frame. + + Args: + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. + centers (np.ndarray): The coordinates of corresponding keypoints + (of multiple persons). Shape: M * 2. + max_values (np.ndarray): The max values of each keypoint. Shape: M. 
+ """ + + sigma = self.sigma + img_h, img_w = arr.shape + + for center, max_value in zip(centers, max_values): + if max_value < self.eps: + continue + + mu_x, mu_y = center[0], center[1] + st_x = max(int(mu_x - 3 * sigma), 0) + ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) + st_y = max(int(mu_y - 3 * sigma), 0) + ed_y = min(int(mu_y + 3 * sigma) + 1, img_h) + x = np.arange(st_x, ed_x, 1, np.float32) + y = np.arange(st_y, ed_y, 1, np.float32) + + # if the keypoint not in the heatmap coordinate system + if not (len(x) and len(y)): + continue + y = y[:, None] + + patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) + patch = patch * max_value + arr[st_y:ed_y, st_x:ed_x] = \ + np.maximum(arr[st_y:ed_y, st_x:ed_x], patch) + + def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray, + ends: np.ndarray, start_values: np.ndarray, + end_values: np.ndarray) -> None: + """Generate pseudo heatmap for one limb in one frame. + + Args: + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. + starts (np.ndarray): The coordinates of one keypoint in the + corresponding limbs. Shape: M * 2. + ends (np.ndarray): The coordinates of the other keypoint in the + corresponding limbs. Shape: M * 2. + start_values (np.ndarray): The max values of one keypoint in the + corresponding limbs. Shape: M. + end_values (np.ndarray): The max values of the other keypoint + in the corresponding limbs. Shape: M. 
        """

        sigma = self.sigma
        img_h, img_w = arr.shape

        for start, end, start_value, end_value in zip(starts, ends,
                                                      start_values,
                                                      end_values):
            # The limb is only as confident as its weaker endpoint.
            value_coeff = min(start_value, end_value)
            if value_coeff < self.eps:
                continue

            # Bounding box of the segment, padded by 3 sigma and clipped to
            # the canvas bounds.
            min_x, max_x = min(start[0], end[0]), max(start[0], end[0])
            min_y, max_y = min(start[1], end[1]), max(start[1], end[1])

            min_x = max(int(min_x - 3 * sigma), 0)
            max_x = min(int(max_x + 3 * sigma) + 1, img_w)
            min_y = max(int(min_y - 3 * sigma), 0)
            max_y = min(int(max_y + 3 * sigma) + 1, img_h)

            x = np.arange(min_x, max_x, 1, np.float32)
            y = np.arange(min_y, max_y, 1, np.float32)

            # The clipped box is empty: the limb lies outside the heatmap.
            if not (len(x) and len(y)):
                continue

            y = y[:, None]
            x_0 = np.zeros_like(x)
            y_0 = np.zeros_like(y)

            # distance to start keypoints
            d2_start = ((x - start[0])**2 + (y - start[1])**2)

            # distance to end keypoints
            d2_end = ((x - end[0])**2 + (y - end[1])**2)

            # the distance between start and end keypoints.
            d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2)

            # Degenerate (sub-pixel) limb: fall back to a single keypoint
            # gaussian at the start point.
            if d2_ab < 1:
                self.generate_a_heatmap(arr, start[None], start_value[None])
                continue

            # Normalized projection of each pixel onto the segment
            # (0 at `start`, 1 at `end`).
            coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab

            # Pixels projecting before `start`, after `end`, or onto the
            # segment proper, respectively.
            a_dominate = coeff <= 0
            b_dominate = coeff >= 1
            seg_dominate = 1 - a_dominate - b_dominate

            # Squared distance from each pixel to its projection on the
            # (infinite) line through the limb.
            position = np.stack([x + y_0, y + x_0], axis=-1)
            projection = start + np.stack([coeff, coeff], axis=-1) * (
                end - start)
            d2_line = position - projection
            d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2
            # Piecewise squared distance to the limb *segment*: endpoint
            # distance outside the segment, line distance inside it.
            d2_seg = (
                a_dominate * d2_start + b_dominate * d2_end +
                seg_dominate * d2_line)

            patch = np.exp(-d2_seg / 2. / sigma**2)
            patch = patch * value_coeff

            # Merge with whatever is already on the canvas.
            arr[min_y:max_y, min_x:max_x] = \
                np.maximum(arr[min_y:max_y, min_x:max_x], patch)

    def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray,
                         max_values: np.ndarray) -> None:
        """Generate pseudo heatmap for all keypoints and limbs in one frame (if
        needed).

        Args:
            arr (np.ndarray): The array to store the generated heatmaps.
+ Shape: V * img_h * img_w. + kps (np.ndarray): The coordinates of keypoints in this frame. + Shape: M * V * 2. + max_values (np.ndarray): The confidence score of each keypoint. + Shape: M * V. + """ + + if self.with_kp: + num_kp = kps.shape[1] + for i in range(num_kp): + self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i]) + + if self.with_limb: + for i, limb in enumerate(self.skeletons): + start_idx, end_idx = limb + starts = kps[:, start_idx] + ends = kps[:, end_idx] + + start_values = max_values[:, start_idx] + end_values = max_values[:, end_idx] + self.generate_a_limb_heatmap(arr[i], starts, ends, + start_values, end_values) + + def gen_an_aug(self, results: Dict) -> np.ndarray: + """Generate pseudo heatmaps for all frames. + + Args: + results (dict): The dictionary that contains all info of a sample. + + Returns: + np.ndarray: The generated pseudo heatmaps. + """ + + all_kps = results['keypoint'].astype(np.float32) + kp_shape = all_kps.shape + + if 'keypoint_score' in results: + all_kpscores = results['keypoint_score'] + else: + all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) + + img_h, img_w = results['img_shape'] + + # scale img_h, img_w and kps + img_h = int(img_h * self.scaling + 0.5) + img_w = int(img_w * self.scaling + 0.5) + all_kps[..., :2] *= self.scaling + + num_frame = kp_shape[1] + num_c = 0 + if self.with_kp: + num_c += all_kps.shape[2] + if self.with_limb: + num_c += len(self.skeletons) + + ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32) + + for i in range(num_frame): + # M, V, C + kps = all_kps[:, i] + # M, C + kpscores = all_kpscores[:, i] if self.use_score else \ + np.ones_like(all_kpscores[:, i]) + + self.generate_heatmap(ret[i], kps, kpscores) + return ret + + def transform(self, results: Dict) -> Dict: + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + heatmap = self.gen_an_aug(results) + key = 'heatmap_imgs' if 'imgs' in results else 'imgs' + + if self.double: + indices = np.arange(heatmap.shape[1], dtype=np.int64) + left, right = (self.left_kp, self.right_kp) if self.with_kp else ( + self.left_limb, self.right_limb) + for l, r in zip(left, right): # noqa: E741 + indices[l] = r + indices[r] = l + heatmap_flip = heatmap[..., ::-1][:, indices] + heatmap = np.concatenate([heatmap, heatmap_flip]) + results[key] = heatmap + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'sigma={self.sigma}, ' + f'use_score={self.use_score}, ' + f'with_kp={self.with_kp}, ' + f'with_limb={self.with_limb}, ' + f'skeletons={self.skeletons}, ' + f'double={self.double}, ' + f'left_kp={self.left_kp}, ' + f'right_kp={self.right_kp}, ' + f'left_limb={self.left_limb}, ' + f'right_limb={self.right_limb}, ' + f'scaling={self.scaling})') + return repr_str + + +@TRANSFORMS.register_module() +class PoseCompact(BaseTransform): + """Convert the coordinates of keypoints to make it more compact. + Specifically, it first find a tight bounding box that surrounds all joints + in each frame, then we expand the tight box by a given padding ratio. For + example, if 'padding == 0.25', then the expanded box has unchanged center, + and 1.25x width and height. + + Required Keys: + + - keypoint + - img_shape + + Modified Keys: + + - img_shape + - keypoint + + Added Keys: + + - crop_quadruple + + Args: + padding (float): The padding size. Defaults to 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Defaults to 10. + hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Defaults to None. 
+ allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Defaults to True. + """ + + def __init__(self, + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Optional[Union[float, Tuple[float]]] = None, + allow_imgpad: bool = True) -> None: + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + + self.hw_ratio = hw_ratio + + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def transform(self, results: Dict) -> Dict: + """Convert the coordinates of keypoints to make it more compact. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + img_shape = results['img_shape'] + h, w = img_shape + kp = results['keypoint'] + + # Make NaN zero + kp[np.isnan(kp)] = 0. + kp_x = kp[..., 0] + kp_y = kp[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return results + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + 
+ new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + + # the order is x, y, w, h (in [0, 1]), a tuple + crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.)) + new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w, + (max_y - min_y) / h) + crop_quadruple = _combine_quadruple(crop_quadruple, new_crop_quadruple) + results['crop_quadruple'] = crop_quadruple + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str + + +@TRANSFORMS.register_module() +class PreNormalize3D(BaseTransform): + """PreNormalize for NTURGB+D 3D keypoints (x, y, z). + + PreNormalize3D first subtracts the coordinates of each joint + from the coordinates of the 'spine' (joint #1 in ntu) of the first person + in the first frame. Subsequently, it performs a 3D rotation to fix the Z + axis parallel to the 3D vector from the 'hip' (joint #0) and the 'spine' + (joint #1) and the X axis toward the 3D vector from the 'right shoulder' + (joint #8) and the 'left shoulder' (joint #4). Codes adapted from + https://github.com/lshiwjx/2s-AGCN. + + Required Keys: + + - keypoint + - total_frames (optional) + + Modified Keys: + + - keypoint + + Added Keys: + + - body_center + + Args: + zaxis (list[int]): The target Z axis for the 3D rotation. + Defaults to ``[0, 1]``. + xaxis (list[int]): The target X axis for the 3D rotation. + Defaults to ``[8, 4]``. + align_spine (bool): Whether to perform a 3D rotation to + align the spine. Defaults to True. + align_shoulder (bool): Whether to perform a 3D rotation + to align the shoulder. Defaults to True. + align_center (bool): Whether to align the body center. + Defaults to True. 
+ """ + + def __init__(self, + zaxis: List[int] = [0, 1], + xaxis: List[int] = [8, 4], + align_spine: bool = True, + align_shoulder: bool = True, + align_center: bool = True) -> None: + self.zaxis = zaxis + self.xaxis = xaxis + self.align_center = align_center + self.align_spine = align_spine + self.align_shoulder = align_shoulder + + def unit_vector(self, vector: np.ndarray) -> np.ndarray: + """Returns the unit vector of the vector.""" + return vector / np.linalg.norm(vector) + + def angle_between(self, v1: np.ndarray, v2: np.ndarray) -> float: + """Returns the angle in radians between vectors 'v1' and 'v2'.""" + if np.abs(v1).sum() < 1e-6 or np.abs(v2).sum() < 1e-6: + return 0 + v1_u = self.unit_vector(v1) + v2_u = self.unit_vector(v2) + return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)) + + def rotation_matrix(self, axis: np.ndarray, theta: float) -> np.ndarray: + """Returns the rotation matrix associated with counterclockwise + rotation about the given axis by theta radians.""" + if np.abs(axis).sum() < 1e-6 or np.abs(theta) < 1e-6: + return np.eye(3) + axis = np.asarray(axis) + axis = axis / np.sqrt(np.dot(axis, axis)) + a = np.cos(theta / 2.0) + b, c, d = -axis * np.sin(theta / 2.0) + aa, bb, cc, dd = a * a, b * b, c * c, d * d + bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d + return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], + [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], + [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]]) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PreNormalize3D`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + skeleton = results['keypoint'] + total_frames = results.get('total_frames', skeleton.shape[1]) + + M, T, V, C = skeleton.shape + assert T == total_frames + if skeleton.sum() == 0: + return results + + index0 = [ + i for i in range(T) if not np.all(np.isclose(skeleton[0, i], 0)) + ] + + assert M in [1, 2] + if M == 2: + index1 = [ + i for i in range(T) + if not np.all(np.isclose(skeleton[1, i], 0)) + ] + if len(index0) < len(index1): + skeleton = skeleton[:, np.array(index1)] + skeleton = skeleton[[1, 0]] + else: + skeleton = skeleton[:, np.array(index0)] + else: + skeleton = skeleton[:, np.array(index0)] + + T_new = skeleton.shape[1] + + if self.align_center: + if skeleton.shape[2] == 25: + main_body_center = skeleton[0, 0, 1].copy() + else: + main_body_center = skeleton[0, 0, -1].copy() + mask = ((skeleton != 0).sum(-1) > 0)[..., None] + skeleton = (skeleton - main_body_center) * mask + + if self.align_spine: + joint_bottom = skeleton[0, 0, self.zaxis[0]] + joint_top = skeleton[0, 0, self.zaxis[1]] + axis = np.cross(joint_top - joint_bottom, [0, 0, 1]) + angle = self.angle_between(joint_top - joint_bottom, [0, 0, 1]) + matrix_z = self.rotation_matrix(axis, angle) + skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_z) + + if self.align_shoulder: + joint_rshoulder = skeleton[0, 0, self.xaxis[0]] + joint_lshoulder = skeleton[0, 0, self.xaxis[1]] + axis = np.cross(joint_rshoulder - joint_lshoulder, [1, 0, 0]) + angle = self.angle_between(joint_rshoulder - joint_lshoulder, + [1, 0, 0]) + matrix_x = self.rotation_matrix(axis, angle) + skeleton = np.einsum('abcd,kd->abck', skeleton, matrix_x) + + results['keypoint'] = skeleton + results['total_frames'] = T_new + results['body_center'] = main_body_center + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'zaxis={self.zaxis}, ' + f'xaxis={self.xaxis}, ' + f'align_center={self.align_center}, ' + f'align_spine={self.align_spine}, ' + 
f'align_shoulder={self.align_shoulder})') + return repr_str + + +@TRANSFORMS.register_module() +class PreNormalize2D(BaseTransform): + """Normalize the range of keypoint values. + + Required Keys: + + - keypoint + - img_shape (optional) + + Modified Keys: + + - keypoint + + Args: + img_shape (tuple[int, int]): The resolution of the original video. + Defaults to ``(1080, 1920)``. + """ + + def __init__(self, img_shape: Tuple[int, int] = (1080, 1920)) -> None: + self.img_shape = img_shape + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`PreNormalize2D`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + h, w = results.get('img_shape', self.img_shape) + results['keypoint'][..., 0] = \ + (results['keypoint'][..., 0] - (w / 2)) / (w / 2) + results['keypoint'][..., 1] = \ + (results['keypoint'][..., 1] - (h / 2)) / (h / 2) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'img_shape={self.img_shape})') + return repr_str + + +@TRANSFORMS.register_module() +class JointToBone(BaseTransform): + """Convert the joint information to bone information. + + Required Keys: + + - keypoint + + Modified Keys: + + - keypoint + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + target (str): The target key for the bone information. + Defaults to ``'keypoint'``. 
+ """ + + def __init__(self, + dataset: str = 'nturgb+d', + target: str = 'keypoint') -> None: + self.dataset = dataset + self.target = target + if self.dataset not in ['nturgb+d', 'openpose', 'coco']: + raise ValueError( + f'The dataset type {self.dataset} is not supported') + if self.dataset == 'nturgb+d': + self.pairs = [(0, 1), (1, 20), (2, 20), (3, 2), (4, 20), (5, 4), + (6, 5), (7, 6), (8, 20), (9, 8), (10, 9), (11, 10), + (12, 0), (13, 12), (14, 13), (15, 14), (16, 0), + (17, 16), (18, 17), (19, 18), (21, 22), (20, 20), + (22, 7), (23, 24), (24, 11)] + elif self.dataset == 'openpose': + self.pairs = ((0, 0), (1, 0), (2, 1), (3, 2), (4, 3), (5, 1), + (6, 5), (7, 6), (8, 2), (9, 8), (10, 9), (11, 5), + (12, 11), (13, 12), (14, 0), (15, 0), (16, 14), (17, + 15)) + elif self.dataset == 'coco': + self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`JointToBone`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + keypoint = results['keypoint'] + M, T, V, C = keypoint.shape + bone = np.zeros((M, T, V, C), dtype=np.float32) + + assert C in [2, 3] + for v1, v2 in self.pairs: + bone[..., v1, :] = keypoint[..., v1, :] - keypoint[..., v2, :] + if C == 3 and self.dataset in ['openpose', 'coco']: + score = (keypoint[..., v1, 2] + keypoint[..., v2, 2]) / 2 + bone[..., v1, 2] = score + + results[self.target] = bone + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'dataset={self.dataset}, ' + f'target={self.target})') + return repr_str + + +@TRANSFORMS.register_module() +class ToMotion(BaseTransform): + """Convert the joint information or bone information to corresponding + motion information. 
+ + Required Keys: + + - keypoint + + Added Keys: + + - motion + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + source (str): The source key for the joint or bone information. + Defaults to ``'keypoint'``. + target (str): The target key for the motion information. + Defaults to ``'motion'``. + """ + + def __init__(self, + dataset: str = 'nturgb+d', + source: str = 'keypoint', + target: str = 'motion') -> None: + self.dataset = dataset + self.source = source + self.target = target + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`ToMotion`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + data = results[self.source] + M, T, V, C = data.shape + motion = np.zeros_like(data) + + assert C in [2, 3] + motion[:, :T - 1] = np.diff(data, axis=1) + if C == 3 and self.dataset in ['openpose', 'coco']: + score = (data[:, :T - 1, :, 2] + data[:, 1:, :, 2]) / 2 + motion[:, :T - 1, :, 2] = score + + results[self.target] = motion + + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'dataset={self.dataset}, ' + f'source={self.source}, ' + f'target={self.target})') + return repr_str + + +@TRANSFORMS.register_module() +class MergeSkeFeat(BaseTransform): + """Merge multi-stream features. + + Args: + feat_list (list[str]): The list of the keys of features. + Defaults to ``['keypoint']``. + target (str): The target key for the merged multi-stream information. + Defaults to ``'keypoint'``. + axis (int): The axis along which the features will be joined. + Defaults to -1. + """ + + def __init__(self, + feat_list: List[str] = ['keypoint'], + target: str = 'keypoint', + axis: int = -1) -> None: + self.feat_list = feat_list + self.target = target + self.axis = axis + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MergeSkeFeat`. 
+ + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + feats = [] + for name in self.feat_list: + feats.append(results.pop(name)) + feats = np.concatenate(feats, axis=self.axis) + results[self.target] = feats + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'feat_list={self.feat_list}, ' + f'target={self.target}, ' + f'axis={self.axis})') + return repr_str + + +@TRANSFORMS.register_module() +class GenSkeFeat(BaseTransform): + """Unified interface for generating multi-stream skeleton features. + + Required Keys: + + - keypoint + - keypoint_score (optional) + + Args: + dataset (str): Define the type of dataset: 'nturgb+d', 'openpose', + 'coco'. Defaults to ``'nturgb+d'``. + feats (list[str]): The list of the keys of features. + Defaults to ``['j']``. + axis (int): The axis along which the features will be joined. + Defaults to -1. + """ + + def __init__(self, + dataset: str = 'nturgb+d', + feats: List[str] = ['j'], + axis: int = -1) -> None: + self.dataset = dataset + self.feats = feats + self.axis = axis + ops = [] + if 'b' in feats or 'bm' in feats: + ops.append(JointToBone(dataset=dataset, target='b')) + ops.append(KeyMapper(remapping={'keypoint': 'j'})) + if 'jm' in feats: + ops.append(ToMotion(dataset=dataset, source='j', target='jm')) + if 'bm' in feats: + ops.append(ToMotion(dataset=dataset, source='b', target='bm')) + ops.append(MergeSkeFeat(feat_list=feats, axis=axis)) + self.ops = Compose(ops) + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`GenSkeFeat`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if 'keypoint_score' in results and 'keypoint' in results: + assert self.dataset != 'nturgb+d' + assert results['keypoint'].shape[ + -1] == 2, 'Only 2D keypoints have keypoint_score. 
        if 'keypoint_score' in results and 'keypoint' in results:
            keypoint = results.pop('keypoint')
            keypoint_score = results.pop('keypoint_score')
            # Fuse the per-joint confidence score into the coordinates as an
            # extra trailing channel: (..., V, 2) -> (..., V, 3).
            results['keypoint'] = np.concatenate(
                [keypoint, keypoint_score[..., None]], -1)
        return self.ops(results)

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'dataset={self.dataset}, '
                    f'feats={self.feats}, '
                    f'axis={self.axis})')
        return repr_str


@TRANSFORMS.register_module()
class UniformSampleFrames(BaseTransform):
    """Uniformly sample frames from the video.

    To sample an n-frame clip from the video. UniformSampleFrames basically
    divide the video into n segments of equal length and randomly sample one
    frame from each segment. To make the testing results reproducible, a
    random seed is set during testing, to make the sampling results
    deterministic.

    Required Keys:

        - total_frames
        - start_index (optional)

    Added Keys:

        - frame_inds
        - frame_interval
        - num_clips
        - clip_len

    Args:
        clip_len (int): Frames of each sampled output clip.
        num_clips (int): Number of clips to be sampled. Defaults to 1.
        test_mode (bool): Store True when building test or validation dataset.
            Defaults to False.
        seed (int): The random seed used during test time. Defaults to 255.
    """

    def __init__(self,
                 clip_len: int,
                 num_clips: int = 1,
                 test_mode: bool = False,
                 seed: int = 255) -> None:
        self.clip_len = clip_len
        self.num_clips = num_clips
        self.test_mode = test_mode
        self.seed = seed

    def _get_train_clips(self, num_frames: int, clip_len: int) -> np.ndarray:
        """Uniformly sample indices for training clips.

        Args:
            num_frames (int): The number of frames.
            clip_len (int): The length of the clip.

        Returns:
            np.ndarray: The sampled indices for training clips.
        """
        all_inds = []
        for clip_idx in range(self.num_clips):
            if num_frames < clip_len:
                # Too few frames: take a contiguous window from a random
                # start; out-of-range indices are wrapped by the caller via
                # ``np.mod``.
                start = np.random.randint(0, num_frames)
                inds = np.arange(start, start + clip_len)
            elif clip_len <= num_frames < 2 * clip_len:
                # Slightly more frames than needed: start from 0..clip_len-1
                # and randomly skip (num_frames - clip_len) positions using a
                # cumulative 0/1 offset, keeping indices strictly increasing.
                basic = np.arange(clip_len)
                inds = np.random.choice(
                    clip_len + 1, num_frames - clip_len, replace=False)
                offset = np.zeros(clip_len + 1, dtype=np.int32)
                offset[inds] = 1
                offset = np.cumsum(offset)
                inds = basic + offset[:-1]
            else:
                # General case: split [0, num_frames) into ``clip_len``
                # near-equal segments and pick one random frame per segment.
                bids = np.array(
                    [i * num_frames // clip_len for i in range(clip_len + 1)])
                bsize = np.diff(bids)
                bst = bids[:clip_len]
                offset = np.random.randint(bsize)
                inds = bst + offset

            all_inds.append(inds)

        return np.concatenate(all_inds)

    def _get_test_clips(self, num_frames: int, clip_len: int) -> np.ndarray:
        """Uniformly sample indices for testing clips.

        Args:
            num_frames (int): The number of frames.
            clip_len (int): The length of the clip.

        Returns:
            np.ndarray: The sampled indices for testing clips.
        """

        # Fixed seed makes the test-time sampling deterministic.
        np.random.seed(self.seed)
        all_inds = []
        for i in range(self.num_clips):
            if num_frames < clip_len:
                start_ind = i if num_frames < self.num_clips \
                    else i * num_frames // self.num_clips
                inds = np.arange(start_ind, start_ind + clip_len)
            elif clip_len <= num_frames < clip_len * 2:
                basic = np.arange(clip_len)
                inds = np.random.choice(
                    clip_len + 1, num_frames - clip_len, replace=False)
                offset = np.zeros(clip_len + 1, dtype=np.int64)
                offset[inds] = 1
                offset = np.cumsum(offset)
                inds = basic + offset[:-1]
            else:
                bids = np.array(
                    [i * num_frames // clip_len for i in range(clip_len + 1)])
                bsize = np.diff(bids)
                bst = bids[:clip_len]
                offset = np.random.randint(bsize)
                inds = bst + offset

            all_inds.append(inds)

        return np.concatenate(all_inds)

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`UniformSampleFrames`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        num_frames = results['total_frames']

        if self.test_mode:
            inds = self._get_test_clips(num_frames, self.clip_len)
        else:
            inds = self._get_train_clips(num_frames, self.clip_len)

        inds = np.mod(inds, num_frames)
        start_index = results.get('start_index', 0)
        inds = inds + start_index

        if 'keypoint' in results:
            kp = results['keypoint']
            assert num_frames == kp.shape[1]
            num_person = kp.shape[0]
            num_persons = [num_person] * num_frames
            # Count valid persons per frame: a person is invalid in a frame
            # when all of its keypoint values are (near) zero.
            for i in range(num_frames):
                j = num_person - 1
                while j >= 0 and np.all(np.abs(kp[j, i]) < 1e-5):
                    j -= 1
                num_persons[i] = j + 1
            # Mark frames around which the number of visible persons changes.
            transitional = [False] * num_frames
            for i in range(1, num_frames - 1):
                if num_persons[i] != num_persons[i - 1]:
                    transitional[i] = transitional[i - 1] = True
                if num_persons[i] != num_persons[i + 1]:
                    transitional[i] = transitional[i + 1] = True
            inds_int = inds.astype(np.int64)
            coeff = np.array([transitional[i] for i in inds_int])
            inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32)

        results['frame_inds'] = inds.astype(np.int32)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = None
        results['num_clips'] = self.num_clips
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'clip_len={self.clip_len}, '
                    f'num_clips={self.num_clips}, '
                    f'test_mode={self.test_mode}, '
                    f'seed={self.seed})')
        return repr_str


@TRANSFORMS.register_module()
class PadTo(BaseTransform):
    """Sample frames from the video.

    To sample an n-frame clip from the video, PadTo samples
    the frames from zero index, and loop or zero pad the frames
    if the length of video frames is less than the value of `length`.

    Required Keys:

        - keypoint
        - total_frames
        - start_index (optional)

    Modified Keys:

        - keypoint
        - total_frames

    Args:
        length (int): The maximum length of the sampled output clip.
        mode (str): The padding mode. Defaults to ``'loop'``.
    """

    def __init__(self, length: int, mode: str = 'loop') -> None:
        self.length = length
        assert mode in ['loop', 'zero']
        self.mode = mode

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PadTo`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        total_frames = results['total_frames']
        assert total_frames <= self.length
        start_index = results.get('start_index', 0)
        # Indices wrap around the available frames, so positions past
        # ``total_frames`` repeat the clip from its beginning ('loop' mode).
        inds = np.arange(start_index, start_index + self.length)
        inds = np.mod(inds, total_frames)

        keypoint = results['keypoint'][:, inds].copy()
        if self.mode == 'zero':
            # Overwrite the looped tail with zeros instead of repeating.
            keypoint[:, total_frames:] = 0

        results['keypoint'] = keypoint
        results['total_frames'] = self.length
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'length={self.length}, '
                    f'mode={self.mode})')
        return repr_str


@TRANSFORMS.register_module()
class PoseDecode(BaseTransform):
    """Load and decode pose with given indices.

    Required Keys:

        - keypoint
        - total_frames (optional)
        - frame_inds (optional)
        - offset (optional)
        - keypoint_score (optional)

    Modified Keys:

        - keypoint
        - keypoint_score (optional)
    """

    @staticmethod
    def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray:
        """Load keypoints according to sampled indexes."""
        return kp[:, frame_inds].astype(np.float32)

    @staticmethod
    def _load_kpscore(kpscore: np.ndarray,
                      frame_inds: np.ndarray) -> np.ndarray:
        """Load keypoint scores according to sampled indexes."""
        return kpscore[:, frame_inds].astype(np.float32)

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`PoseDecode`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        # Fall back to decoding every frame when no sampler ran before this
        # transform.
        if 'total_frames' not in results:
            results['total_frames'] = results['keypoint'].shape[1]

        if 'frame_inds' not in results:
            results['frame_inds'] = np.arange(results['total_frames'])

        if results['frame_inds'].ndim != 1:
            results['frame_inds'] = np.squeeze(results['frame_inds'])

        offset = results.get('offset', 0)
        frame_inds = results['frame_inds'] + offset

        if 'keypoint_score' in results:
            results['keypoint_score'] = self._load_kpscore(
                results['keypoint_score'], frame_inds)

        results['keypoint'] = self._load_kp(results['keypoint'], frame_inds)

        return results

    def __repr__(self) -> str:
        repr_str = f'{self.__class__.__name__}()'
        return repr_str


@TRANSFORMS.register_module()
class MMUniformSampleFrames(UniformSampleFrames):
    """Uniformly sample frames from the multi-modal data."""

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMUniformSampleFrames`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        num_frames = results['total_frames']
        modalities = []
        # Here ``self.clip_len`` is expected to be a dict mapping modality
        # name to its clip length, e.g. ``dict(RGB=8, Pose=32)`` — TODO
        # confirm against the config that instantiates this transform.
        for modality, clip_len in self.clip_len.items():
            if self.test_mode:
                inds = self._get_test_clips(num_frames, clip_len)
            else:
                inds = self._get_train_clips(num_frames, clip_len)
            inds = np.mod(inds, num_frames)
            results[f'{modality}_inds'] = inds.astype(np.int32)
            modalities.append(modality)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = None
        results['num_clips'] = self.num_clips
        if not isinstance(results['modality'], list):
            # should override
            results['modality'] = modalities
        return results


@TRANSFORMS.register_module()
class MMDecode(DecordInit, DecordDecode, PoseDecode):
    """Decode RGB videos and skeletons."""

    def __init__(self, io_backend: str = 'disk', **kwargs) -> None:
        DecordInit.__init__(self, io_backend=io_backend, **kwargs)
        DecordDecode.__init__(self)
        self.io_backend = io_backend
        self.kwargs = kwargs
        self.file_client = None

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMDecode`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        for mod in results['modality']:
            if results[f'{mod}_inds'].ndim != 1:
                results[f'{mod}_inds'] = np.squeeze(results[f'{mod}_inds'])
            frame_inds = results[f'{mod}_inds']
            if mod == 'RGB':
                if 'filename' not in results:
                    results['filename'] = results['frame_dir'] + '.mp4'
                video_reader = self._get_video_reader(results['filename'])
                imgs = self._decord_load_frames(video_reader, frame_inds)
                # Drop the reader handle as soon as frames are decoded.
                del video_reader
                results['imgs'] = imgs
            elif mod == 'Pose':
                assert 'keypoint' in results
                if 'keypoint_score' not in results:
                    # No confidence provided: assume every joint is fully
                    # confident (score 1.0).
                    keypoint_score = [
                        np.ones(keypoint.shape[:-1], dtype=np.float32)
                        for keypoint in results['keypoint']
                    ]
                    results['keypoint_score'] = np.stack(keypoint_score)
                results['keypoint'] = self._load_kp(results['keypoint'],
                                                    frame_inds)
                results['keypoint_score'] = self._load_kpscore(
                    results['keypoint_score'], frame_inds)
            else:
                raise NotImplementedError(
                    f'MMDecode: Modality {mod} not supported')

        # We need to scale human keypoints to the new image size
        if 'imgs' in results and 'keypoint' in results:
            real_img_shape = results['imgs'][0].shape[:2]
            if real_img_shape != results['img_shape']:
                oh, ow = results['img_shape']
                nh, nw = real_img_shape

                assert results['keypoint'].shape[-1] in [2, 3]
                results['keypoint'][..., 0] *= (nw / ow)
                results['keypoint'][..., 1] *= (nh / oh)
                results['img_shape'] = real_img_shape
                results['original_shape'] = real_img_shape

        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}('
                    f'io_backend={self.io_backend})')
        return repr_str


@TRANSFORMS.register_module()
class MMCompact(BaseTransform):
    """Convert the coordinates of keypoints and crop the images to make them
    more compact.

    Required Keys:

        - imgs
        - keypoint
        - img_shape

    Modified Keys:

        - imgs
        - keypoint
        - img_shape

    Args:
        padding (float): The padding size. Defaults to 0.25.
        threshold (int): The threshold for the tight bounding box.
            If the width
            or height of the tight bounding box is smaller than the threshold,
            we do not perform the compact operation. Defaults to 10.
        hw_ratio (float | tuple[float]): The hw_ratio of the expanded
            box. Float indicates the specific ratio and tuple indicates a
            ratio range. If set as None, it means there is no requirement on
            hw_ratio. Defaults to 1.
        allow_imgpad (bool): Whether to allow expanding the box outside the
            image to meet the hw_ratio requirement. Defaults to True.
    """

    def __init__(self,
                 padding: float = 0.25,
                 threshold: int = 10,
                 hw_ratio: Union[float, Tuple[float]] = 1,
                 allow_imgpad: bool = True) -> None:

        self.padding = padding
        self.threshold = threshold
        if hw_ratio is not None:
            # Normalize a scalar ratio to a (min, max) pair.
            hw_ratio = _pair(hw_ratio)
        self.hw_ratio = hw_ratio
        self.allow_imgpad = allow_imgpad
        assert self.padding >= 0

    def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple:
        """Calculate the bounding box surrounding all joints in the frames."""
        h, w = img_shape

        kp_x = keypoint[..., 0]
        kp_y = keypoint[..., 1]

        # Zero coordinates mark invalid/missing joints and are excluded.
        # NOTE(review): ``np.Inf`` was removed in NumPy 2.0 — ``np.inf`` is
        # the portable spelling; confirm the pinned NumPy version.
        min_x = np.min(kp_x[kp_x != 0], initial=np.Inf)
        min_y = np.min(kp_y[kp_y != 0], initial=np.Inf)
        max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf)
        max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf)

        # The compact area is too small
        if max_x - min_x < self.threshold or max_y - min_y < self.threshold:
            return 0, 0, w, h

        center = ((max_x + min_x) / 2, (max_y + min_y) / 2)
        half_width = (max_x - min_x) / 2 * (1 + self.padding)
        half_height = (max_y - min_y) / 2 * (1 + self.padding)

        if self.hw_ratio is not None:
            half_height = max(self.hw_ratio[0] * half_width, half_height)
            half_width = max(1 / self.hw_ratio[1] * half_height, half_width)

        min_x, max_x = center[0] - half_width, center[0] + half_width
        min_y, max_y = center[1] - half_height, center[1] + half_height

        # hot update
        if not self.allow_imgpad:
            min_x, min_y = int(max(0, min_x)), int(max(0, min_y))
            max_x, max_y = int(min(w, max_x)), int(min(h, max_y))
        else:
            min_x, min_y = int(min_x), int(min_y)
            max_x, max_y = int(max_x), int(max_y)
        return min_x, min_y, max_x, max_y

    def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int],
                        box: Tuple[int]) -> List:
        """Crop the images according to the bounding box."""
        h, w = img_shape
        min_x, min_y, max_x, max_y = box
        pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0
        # When the box extends beyond the image, pad first so the crop below
        # always stays within bounds.
        if min_x < 0:
            pad_l = -min_x
            min_x, max_x = 0, max_x + pad_l
            w += pad_l
        if min_y < 0:
            pad_u = -min_y
            min_y, max_y = 0, max_y + pad_u
            h += pad_u
        if max_x > w:
            pad_r = max_x - w
            w = max_x
        if max_y > h:
            pad_d = max_y - h
            h = max_y

        if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0:
            imgs = [
                np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0)))
                for img in imgs
            ]
        imgs = [img[min_y:max_y, min_x:max_x] for img in imgs]
        return imgs

    def transform(self, results: Dict) -> Dict:
        """The transform function of :class:`MMCompact`.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        img_shape = results['img_shape']
        kp = results['keypoint']
        # Make NaN zero
        kp[np.isnan(kp)] = 0.
        min_x, min_y, max_x, max_y = self._get_box(kp, img_shape)

        kp_x, kp_y = kp[..., 0], kp[..., 1]
        # Shift only valid (non-zero) joints into the cropped frame.
        kp_x[kp_x != 0] -= min_x
        kp_y[kp_y != 0] -= min_y

        new_shape = (max_y - min_y, max_x - min_x)
        results['img_shape'] = new_shape
        results['imgs'] = self._compact_images(results['imgs'], img_shape,
                                               (min_x, min_y, max_x, max_y))
        return results

    def __repr__(self) -> str:
        repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '
                    f'threshold={self.threshold}, '
                    f'hw_ratio={self.hw_ratio}, '
                    f'allow_imgpad={self.allow_imgpad})')
        return repr_str
diff --git a/mmaction/datasets/transforms/processing.py b/mmaction/datasets/transforms/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ba68fac23babeb8f2067ba36622ce3795f86857
--- /dev/null
+++ b/mmaction/datasets/transforms/processing.py
@@ -0,0 +1,1444 @@
# Copyright (c) OpenMMLab. All rights reserved.
import random
import warnings
from numbers import Number
from typing import Sequence

import cv2
import mmcv
import mmengine
import numpy as np
from mmcv.transforms import BaseTransform
from mmcv.transforms.utils import cache_randomness
from torch.nn.modules.utils import _pair

from mmaction.registry import TRANSFORMS


def _combine_quadruple(a, b):
    # Compose two relative (x, y, w, h) crop quadruples: apply ``b`` inside
    # the region described by ``a``.
    return a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2], a[3] * b[3]


def _flip_quadruple(a):
    # Horizontally mirror a relative (x, y, w, h) crop quadruple.
    return 1 - a[0] - a[2], a[1], a[2], a[3]


def _init_lazy_if_proper(results, lazy):
    """Initialize lazy operation properly.

    Make sure that a lazy operation is properly initialized,
    and avoid a non-lazy operation accidentally getting mixed in.

    Required keys in results are "imgs" if "img_shape" not in results,
    otherwise, Required keys in results are "img_shape", add or modified keys
    are "img_shape", "lazy".
    Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip",
    "flip_direction", "interpolation".

    Args:
        results (dict): A dict stores data pipeline result.
        lazy (bool): Determine whether to apply lazy operation. Default: False.
    """

    if 'img_shape' not in results:
        results['img_shape'] = results['imgs'][0].shape[:2]
    if lazy:
        if 'lazy' not in results:
            img_h, img_w = results['img_shape']
            lazyop = dict()
            lazyop['original_shape'] = results['img_shape']
            lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],
                                           dtype=np.float32)
            lazyop['flip'] = False
            lazyop['flip_direction'] = None
            lazyop['interpolation'] = None
            results['lazy'] = lazyop
    else:
        assert 'lazy' not in results, 'Use Fuse after lazy operations'


@TRANSFORMS.register_module()
class Fuse(BaseTransform):
    """Fuse lazy operations.

    Fusion order:
        crop -> resize -> flip

    Required keys are "imgs", "img_shape" and "lazy", added or modified keys
    are "imgs", "lazy".
    Required keys in "lazy" are "crop_bbox", "interpolation",
    "flip_direction".
    """

    def transform(self, results):
        """Fuse lazy operations.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        if 'lazy' not in results:
            raise ValueError('No lazy operation detected')
        lazyop = results['lazy']
        imgs = results['imgs']

        # crop
        left, top, right, bottom = lazyop['crop_bbox'].round().astype(int)
        imgs = [img[top:bottom, left:right] for img in imgs]

        # resize
        img_h, img_w = results['img_shape']
        if lazyop['interpolation'] is None:
            interpolation = 'bilinear'
        else:
            interpolation = lazyop['interpolation']
        imgs = [
            mmcv.imresize(img, (img_w, img_h), interpolation=interpolation)
            for img in imgs
        ]

        # flip
        if lazyop['flip']:
            for img in imgs:
                # In-place flip to avoid an extra copy per frame.
                mmcv.imflip_(img, lazyop['flip_direction'])

        results['imgs'] = imgs
        del results['lazy']

        return results


@TRANSFORMS.register_module()
class RandomCrop(BaseTransform):
    """Vanilla square random crop that specifics the output size.

    Required keys in results are "img_shape", "keypoint" (optional), "imgs"
    (optional), added or modified keys are "keypoint", "imgs", "lazy"; Required
    keys in "lazy" are "flip", "crop_bbox", added or modified key is
    "crop_bbox".

    Args:
        size (int): The output size of the images.
        lazy (bool): Determine whether to apply lazy operation. Default: False.
    """

    def __init__(self, size, lazy=False):
        if not isinstance(size, int):
            raise TypeError(f'Size must be an int, but got {type(size)}')
        self.size = size
        self.lazy = lazy

    @staticmethod
    def _crop_kps(kps, crop_bbox):
        """Static method for cropping keypoint."""
        return kps - crop_bbox[:2]

    @staticmethod
    def _crop_imgs(imgs, crop_bbox):
        """Static method for cropping images."""
        x1, y1, x2, y2 = crop_bbox
        return [img[y1:y2, x1:x2] for img in imgs]

    @staticmethod
    def _box_crop(box, crop_bbox):
        """Crop the bounding boxes according to the crop_bbox.

        Args:
            box (np.ndarray): The bounding boxes.
            crop_bbox(np.ndarray): The bbox used to crop the original image.
        """

        x1, y1, x2, y2 = crop_bbox
        img_w, img_h = x2 - x1, y2 - y1

        box_ = box.copy()
        # Shift boxes into the crop frame and clip to the new image extent.
        box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1)
        box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1)
        return box_

    def _all_box_crop(self, results, crop_bbox):
        """Crop the gt_bboxes and proposals in results according to crop_bbox.

        Args:
            results (dict): All information about the sample, which contain
                'gt_bboxes' and 'proposals' (optional).
            crop_bbox(np.ndarray): The bbox used to crop the original image.
        """
        results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox)
        if 'proposals' in results and results['proposals'] is not None:
            assert results['proposals'].shape[1] == 4
            results['proposals'] = self._box_crop(results['proposals'],
                                                  crop_bbox)
        return results

    def transform(self, results):
        """Performs the RandomCrop augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        _init_lazy_if_proper(results, self.lazy)
        if 'keypoint' in results:
            assert not self.lazy, ('Keypoint Augmentations are not compatible '
                                   'with lazy == True')

        img_h, img_w = results['img_shape']
        assert self.size <= img_h and self.size <= img_w

        y_offset = 0
        x_offset = 0
        if img_h > self.size:
            y_offset = int(np.random.randint(0, img_h - self.size))
        if img_w > self.size:
            x_offset = int(np.random.randint(0, img_w - self.size))

        if 'crop_quadruple' not in results:
            results['crop_quadruple'] = np.array(
                [0, 0, 1, 1],  # x, y, w, h
                dtype=np.float32)

        x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
        w_ratio, h_ratio = self.size / img_w, self.size / img_h

        # Compose this crop with any crop already recorded, both expressed
        # relative to the ORIGINAL image.
        old_crop_quadruple = results['crop_quadruple']
        old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
        old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
        new_crop_quadruple = [
            old_x_ratio + x_ratio * old_w_ratio,
            old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
            h_ratio * old_h_ratio
        ]
        results['crop_quadruple'] = np.array(
            new_crop_quadruple, dtype=np.float32)

        new_h, new_w = self.size, self.size

        crop_bbox = np.array(
            [x_offset, y_offset, x_offset + new_w, y_offset + new_h])
        results['crop_bbox'] = crop_bbox

        results['img_shape'] = (new_h, new_w)

        if not self.lazy:
            if 'keypoint' in results:
                results['keypoint'] = self._crop_kps(results['keypoint'],
                                                     crop_bbox)
            if 'imgs' in results:
                results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Put Flip at last for now')

            # record crop_bbox in lazyop dict to ensure only crop once in Fuse
            lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
            left = x_offset * (lazy_right - lazy_left) / img_w
            right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
            top = y_offset * (lazy_bottom - lazy_top) / img_h
            bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
            lazyop['crop_bbox'] = np.array([(lazy_left + left),
                                            (lazy_top + top),
                                            (lazy_left + right),
                                            (lazy_top + bottom)],
                                           dtype=np.float32)

        # Process entity boxes
        if 'gt_bboxes' in results:
            assert not self.lazy
            results = self._all_box_crop(results, results['crop_bbox'])

        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}(size={self.size}, '
                    f'lazy={self.lazy})')
        return repr_str


@TRANSFORMS.register_module()
class RandomResizedCrop(RandomCrop):
    """Random crop that specifics the area and height-weight ratio range.

    Required keys in results are "img_shape", "crop_bbox", "imgs" (optional),
    "keypoint" (optional), added or modified keys are "imgs", "keypoint",
    "crop_bbox" and "lazy"; Required keys in "lazy" are "flip", "crop_bbox",
    added or modified key is "crop_bbox".

    Args:
        area_range (Tuple[float]): The candidate area scales range of
            output cropped images. Default: (0.08, 1.0).
        aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of
            output cropped images. Default: (3 / 4, 4 / 3).
        lazy (bool): Determine whether to apply lazy operation. Default: False.
    """

    def __init__(self,
                 area_range=(0.08, 1.0),
                 aspect_ratio_range=(3 / 4, 4 / 3),
                 lazy=False):
        self.area_range = area_range
        self.aspect_ratio_range = aspect_ratio_range
        self.lazy = lazy
        if not mmengine.is_tuple_of(self.area_range, float):
            raise TypeError(f'Area_range must be a tuple of float, '
                            f'but got {type(area_range)}')
        if not mmengine.is_tuple_of(self.aspect_ratio_range, float):
            raise TypeError(f'Aspect_ratio_range must be a tuple of float, '
                            f'but got {type(aspect_ratio_range)}')

    @staticmethod
    def get_crop_bbox(img_shape,
                      area_range,
                      aspect_ratio_range,
                      max_attempts=10):
        """Get a crop bbox given the area range and aspect ratio range.

        Args:
            img_shape (Tuple[int]): Image shape
            area_range (Tuple[float]): The candidate area scales range of
                output cropped images. Default: (0.08, 1.0).
            aspect_ratio_range (Tuple[float]): The candidate aspect
                ratio range of output cropped images. Default: (3 / 4, 4 / 3).
            max_attempts (int): Max attempts times to generate random
                candidate bounding box. If it doesn't qualified one, the
                center bounding box will be used. Default: 10.
        Returns:
            (list[int]) A random crop bbox within the area range and aspect
            ratio range.
        """
        assert 0 < area_range[0] <= area_range[1] <= 1
        assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1]

        img_h, img_w = img_shape
        area = img_h * img_w

        # Sample aspect ratios log-uniformly so that w/h and h/w ratios are
        # equally likely.
        min_ar, max_ar = aspect_ratio_range
        aspect_ratios = np.exp(
            np.random.uniform(
                np.log(min_ar), np.log(max_ar), size=max_attempts))
        target_areas = np.random.uniform(*area_range, size=max_attempts) * area
        candidate_crop_w = np.round(np.sqrt(target_areas *
                                            aspect_ratios)).astype(np.int32)
        candidate_crop_h = np.round(np.sqrt(target_areas /
                                            aspect_ratios)).astype(np.int32)

        for i in range(max_attempts):
            crop_w = candidate_crop_w[i]
            crop_h = candidate_crop_h[i]
            if crop_h <= img_h and crop_w <= img_w:
                x_offset = random.randint(0, img_w - crop_w)
                y_offset = random.randint(0, img_h - crop_h)
                return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h

        # Fallback
        crop_size = min(img_h, img_w)
        x_offset = (img_w - crop_size) // 2
        y_offset = (img_h - crop_size) // 2
        return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size

    def transform(self, results):
        """Performs the RandomResizeCrop augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        _init_lazy_if_proper(results, self.lazy)
        if 'keypoint' in results:
            assert not self.lazy, ('Keypoint Augmentations are not compatible '
                                   'with lazy == True')

        img_h, img_w = results['img_shape']

        left, top, right, bottom = self.get_crop_bbox(
            (img_h, img_w), self.area_range, self.aspect_ratio_range)
        new_h, new_w = bottom - top, right - left

        if 'crop_quadruple' not in results:
            results['crop_quadruple'] = np.array(
                [0, 0, 1, 1],  # x, y, w, h
                dtype=np.float32)

        x_ratio, y_ratio = left / img_w, top / img_h
        w_ratio, h_ratio = new_w / img_w, new_h / img_h

        # Compose this crop with any crop already recorded, relative to the
        # ORIGINAL image.
        old_crop_quadruple = results['crop_quadruple']
        old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
        old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
        new_crop_quadruple = [
            old_x_ratio + x_ratio * old_w_ratio,
            old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
            h_ratio * old_h_ratio
        ]
        results['crop_quadruple'] = np.array(
            new_crop_quadruple, dtype=np.float32)

        crop_bbox = np.array([left, top, right, bottom])
        results['crop_bbox'] = crop_bbox
        results['img_shape'] = (new_h, new_w)

        if not self.lazy:
            if 'keypoint' in results:
                results['keypoint'] = self._crop_kps(results['keypoint'],
                                                     crop_bbox)
            if 'imgs' in results:
                results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Put Flip at last for now')

            # record crop_bbox in lazyop dict to ensure only crop once in Fuse
            lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
            left = left * (lazy_right - lazy_left) / img_w
            right = right * (lazy_right - lazy_left) / img_w
            top = top * (lazy_bottom - lazy_top) / img_h
            bottom = bottom * (lazy_bottom - lazy_top) / img_h
            lazyop['crop_bbox'] = np.array([(lazy_left + left),
                                            (lazy_top + top),
                                            (lazy_left + right),
                                            (lazy_top + bottom)],
                                           dtype=np.float32)

        if 'gt_bboxes' in results:
            assert not self.lazy
            results = self._all_box_crop(results, results['crop_bbox'])

        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'area_range={self.area_range}, '
                    f'aspect_ratio_range={self.aspect_ratio_range}, '
                    f'lazy={self.lazy})')
        return repr_str


@TRANSFORMS.register_module()
class MultiScaleCrop(RandomCrop):
    """Crop images with a list of randomly selected scales.

    Randomly select the w and h scales from a list of scales. Scale of 1 means
    the base size, which is the minimal of image width and height. The scale
    level of w and h is controlled to be smaller than a certain value to
    prevent too large or small aspect ratio.

    Required keys are "img_shape", "imgs" (optional), "keypoint" (optional),
    added or modified keys are "imgs", "crop_bbox", "img_shape", "lazy" and
    "scales". Required keys in "lazy" are "crop_bbox", added or modified key is
    "crop_bbox".

    Args:
        input_size (int | tuple[int]): (w, h) of network input.
        scales (tuple[float]): width and height scales to be selected.
        max_wh_scale_gap (int): Maximum gap of w and h scale levels.
            Default: 1.
        random_crop (bool): If set to True, the cropping bbox will be randomly
            sampled, otherwise it will be sampler from fixed regions.
            Default: False.
        num_fixed_crops (int): If set to 5, the cropping bbox will keep 5
            basic fixed regions: "upper left", "upper right", "lower left",
            "lower right", "center". If set to 13, the cropping bbox will
            append another 8 fix regions: "center left", "center right",
            "lower center", "upper center", "upper left quarter",
            "upper right quarter", "lower left quarter", "lower right quarter".
            Default: 5.
        lazy (bool): Determine whether to apply lazy operation. Default: False.
    """

    def __init__(self,
                 input_size,
                 scales=(1, ),
                 max_wh_scale_gap=1,
                 random_crop=False,
                 num_fixed_crops=5,
                 lazy=False):
        self.input_size = _pair(input_size)
        if not mmengine.is_tuple_of(self.input_size, int):
            raise TypeError(f'Input_size must be int or tuple of int, '
                            f'but got {type(input_size)}')

        if not isinstance(scales, tuple):
            raise TypeError(f'Scales must be tuple, but got {type(scales)}')

        if num_fixed_crops not in [5, 13]:
            raise ValueError(f'Num_fix_crops must be in {[5, 13]}, '
                             f'but got {num_fixed_crops}')

        self.scales = scales
        self.max_wh_scale_gap = max_wh_scale_gap
        self.random_crop = random_crop
        self.num_fixed_crops = num_fixed_crops
        self.lazy = lazy

    def transform(self, results):
        """Performs the MultiScaleCrop augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        _init_lazy_if_proper(results, self.lazy)
        if 'keypoint' in results:
            assert not self.lazy, ('Keypoint Augmentations are not compatible '
                                   'with lazy == True')

        img_h, img_w = results['img_shape']
        base_size = min(img_h, img_w)
        crop_sizes = [int(base_size * s) for s in self.scales]

        candidate_sizes = []
        # Only allow (w, h) scale pairs whose levels differ by at most
        # ``max_wh_scale_gap`` to avoid extreme aspect ratios.
        for i, h in enumerate(crop_sizes):
            for j, w in enumerate(crop_sizes):
                if abs(i - j) <= self.max_wh_scale_gap:
                    candidate_sizes.append([w, h])

        crop_size = random.choice(candidate_sizes)
        for i in range(2):
            # Snap to the network input size when nearly equal.
            if abs(crop_size[i] - self.input_size[i]) < 3:
                crop_size[i] = self.input_size[i]

        crop_w, crop_h = crop_size

        if self.random_crop:
            x_offset = random.randint(0, img_w - crop_w)
            y_offset = random.randint(0, img_h - crop_h)
        else:
            w_step = (img_w - crop_w) // 4
            h_step = (img_h - crop_h) // 4
            candidate_offsets = [
                (0, 0),  # upper left
                (4 * w_step, 0),  # upper right
                (0, 4 * h_step),  # lower left
                (4 * w_step, 4 * h_step),  # lower right
                (2 * w_step, 2 * h_step),  # center
            ]
            if self.num_fixed_crops == 13:
                extra_candidate_offsets = [
                    (0, 2 * h_step),  # center left
                    (4 * w_step, 2 * h_step),  # center right
                    (2 * w_step, 4 * h_step),  # lower center
                    (2 * w_step, 0 * h_step),  # upper center
                    (1 * w_step, 1 * h_step),  # upper left quarter
                    (3 * w_step, 1 * h_step),  # upper right quarter
                    (1 * w_step, 3 * h_step),  # lower left quarter
                    (3 * w_step, 3 * h_step)  # lower right quarter
                ]
                candidate_offsets.extend(extra_candidate_offsets)
            x_offset, y_offset = random.choice(candidate_offsets)

        new_h, new_w = crop_h, crop_w

        crop_bbox = np.array(
            [x_offset, y_offset, x_offset + new_w, y_offset + new_h])
        results['crop_bbox'] = crop_bbox
        results['img_shape'] = (new_h, new_w)
        results['scales'] = self.scales

        if 'crop_quadruple' not in results:
            results['crop_quadruple'] = np.array(
                [0, 0, 1, 1],  # x, y, w, h
                dtype=np.float32)

        x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
        w_ratio, h_ratio = new_w / img_w, new_h / img_h

        # Compose this crop with any crop already recorded, relative to the
        # ORIGINAL image.
        old_crop_quadruple = results['crop_quadruple']
        old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
        old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
        new_crop_quadruple = [
            old_x_ratio + x_ratio * old_w_ratio,
            old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
            h_ratio * old_h_ratio
        ]
        results['crop_quadruple'] = np.array(
            new_crop_quadruple, dtype=np.float32)

        if not self.lazy:
            if 'keypoint' in results:
                results['keypoint'] = self._crop_kps(results['keypoint'],
                                                     crop_bbox)
            if 'imgs' in results:
                results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox)
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Put Flip at last for now')

            # record crop_bbox in lazyop dict to ensure only crop once in Fuse
            lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
            left = x_offset * (lazy_right - lazy_left) / img_w
            right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
            top = y_offset * (lazy_bottom - lazy_top) / img_h
            bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
            lazyop['crop_bbox'] = np.array([(lazy_left + left),
                                            (lazy_top + top),
                                            (lazy_left + right),
                                            (lazy_top + bottom)],
                                           dtype=np.float32)

        if 'gt_bboxes' in results:
            assert not self.lazy
            results = self._all_box_crop(results, results['crop_bbox'])

        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'input_size={self.input_size}, scales={self.scales}, '
                    f'max_wh_scale_gap={self.max_wh_scale_gap}, '
                    f'random_crop={self.random_crop}, '
                    f'num_fixed_crops={self.num_fixed_crops}, '
                    f'lazy={self.lazy})')
        return repr_str


@TRANSFORMS.register_module()
class Resize(BaseTransform):
    """Resize images to a specific size.

    Required keys are "img_shape", "modality", "imgs" (optional), "keypoint"
    (optional), added or modified keys are "imgs", "img_shape", "keep_ratio",
    "scale_factor", "lazy", "resize_size". Required keys in "lazy" is None,
    added or modified key is "interpolation".

    Args:
        scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling
            factor or maximum size:
            If it is a float number, the image will be rescaled by this
            factor, else if it is a tuple of 2 integers, the image will
            be rescaled as large as possible within the scale.
            Otherwise, it serves as (w, h) of output size.
        keep_ratio (bool): If set to True, Images will be resized without
            changing the aspect ratio. Otherwise, it will resize images to a
            given size. Default: True.
        interpolation (str): Algorithm used for interpolation,
            accepted values are "nearest", "bilinear", "bicubic", "area",
            "lanczos". Default: "bilinear".
        lazy (bool): Determine whether to apply lazy operation. Default: False.
+ """ + + def __init__(self, + scale, + keep_ratio=True, + interpolation='bilinear', + lazy=False): + if isinstance(scale, float): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + # assign np.inf to long edge for rescaling short edge later. + scale = (np.inf, max_long_edge) + else: + raise TypeError( + f'Scale must be float or tuple of int, but got {type(scale)}') + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.lazy = lazy + + def _resize_imgs(self, imgs, new_w, new_h): + """Static method for resizing keypoint.""" + return [ + mmcv.imresize( + img, (new_w, new_h), interpolation=self.interpolation) + for img in imgs + ] + + @staticmethod + def _resize_kps(kps, scale_factor): + """Static method for resizing keypoint.""" + return kps * scale_factor + + @staticmethod + def _box_resize(box, scale_factor): + """Rescale the bounding boxes according to the scale_factor. + + Args: + box (np.ndarray): The bounding boxes. + scale_factor (np.ndarray): The scale factor used for rescaling. + """ + assert len(scale_factor) == 2 + scale_factor = np.concatenate([scale_factor, scale_factor]) + return box * scale_factor + + def transform(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + if 'scale_factor' not in results: + results['scale_factor'] = np.array([1, 1], dtype=np.float32) + img_h, img_w = results['img_shape'] + + if self.keep_ratio: + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale) + else: + new_w, new_h = self.scale + + self.scale_factor = np.array([new_w / img_w, new_h / img_h], + dtype=np.float32) + + results['img_shape'] = (new_h, new_w) + results['keep_ratio'] = self.keep_ratio + results['scale_factor'] = results['scale_factor'] * self.scale_factor + + if not self.lazy: + if 'imgs' in results: + results['imgs'] = self._resize_imgs(results['imgs'], new_w, + new_h) + if 'keypoint' in results: + results['keypoint'] = self._resize_kps(results['keypoint'], + self.scale_factor) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + lazyop['interpolation'] = self.interpolation + + if 'gt_bboxes' in results: + assert not self.lazy + results['gt_bboxes'] = self._box_resize(results['gt_bboxes'], + self.scale_factor) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_resize( + results['proposals'], self.scale_factor) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'scale={self.scale}, keep_ratio={self.keep_ratio}, ' + f'interpolation={self.interpolation}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class RandomRescale(BaseTransform): + """Randomly resize images so that the short_edge is resized to a specific + size in a given range. The scale ratio is unchanged after resizing. 
+ + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size", + "short_edge". + + Args: + scale_range (tuple[int]): The range of short edge length. A closed + interval. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + """ + + def __init__(self, scale_range, interpolation='bilinear'): + self.scale_range = scale_range + # make sure scale_range is legal, first make sure the type is OK + assert mmengine.is_tuple_of(scale_range, int) + assert len(scale_range) == 2 + assert scale_range[0] < scale_range[1] + assert np.all([x > 0 for x in scale_range]) + + self.keep_ratio = True + self.interpolation = interpolation + + def transform(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + short_edge = np.random.randint(self.scale_range[0], + self.scale_range[1] + 1) + resize = Resize((-1, short_edge), + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + + results['short_edge'] = short_edge + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@TRANSFORMS.register_module() +class Flip(BaseTransform): + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "keypoint", "lazy" and + "flip_direction". Required keys in "lazy" is None, added or modified key + are "flip" and "flip_direction". 
The Flip augmentation should be placed + after any cropping / reshaping augmentations, to make sure crop_quadruple + is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". + flip_label_map (Dict[int, int] | None): Transform the label of the + flipped image with the specific label. Default: None. + left_kp (list[int]): Indexes of left keypoints, used to flip keypoints. + Default: None. + right_kp (list[ind]): Indexes of right keypoints, used to flip + keypoints. Default: None. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + left_kp=None, + right_kp=None, + lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. 
' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.flip_label_map = flip_label_map + self.left_kp = left_kp + self.right_kp = right_kp + self.lazy = lazy + + def _flip_imgs(self, imgs, modality): + """Utility function for flipping images.""" + _ = [mmcv.imflip_(img, self.direction) for img in imgs] + lt = len(imgs) + if modality == 'Flow': + # The 1st frame of each 2 frames is flow-x + for i in range(0, lt, 2): + imgs[i] = mmcv.iminvert(imgs[i]) + return imgs + + def _flip_kps(self, kps, kpscores, img_width): + """Utility function for flipping keypoint.""" + kp_x = kps[..., 0] + kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0] + new_order = list(range(kps.shape[2])) + if self.left_kp is not None and self.right_kp is not None: + for left, right in zip(self.left_kp, self.right_kp): + new_order[left] = right + new_order[right] = left + kps = kps[:, :, new_order] + if kpscores is not None: + kpscores = kpscores[:, :, new_order] + return kps, kpscores + + @staticmethod + def _box_flip(box, img_width): + """Flip the bounding boxes given the width of the image. + + Args: + box (np.ndarray): The bounding boxes. + img_width (int): The img width. + """ + box_ = box.copy() + box_[..., 0::4] = img_width - box[..., 2::4] + box_[..., 2::4] = img_width - box[..., 0::4] + return box_ + + def transform(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + assert self.direction == 'horizontal', ( + 'Only horizontal flips are' + 'supported for human keypoints') + + modality = results['modality'] + if modality == 'Flow': + assert self.direction == 'horizontal' + + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + img_width = results['img_shape'][1] + + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + + if not self.lazy: + if flip: + if 'imgs' in results: + results['imgs'] = self._flip_imgs(results['imgs'], + modality) + if 'keypoint' in results: + kp = results['keypoint'] + kpscore = results.get('keypoint_score', None) + kp, kpscore = self._flip_kps(kp, kpscore, img_width) + results['keypoint'] = kp + if 'keypoint_score' in results: + results['keypoint_score'] = kpscore + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + width = results['img_shape'][1] + results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_flip(results['proposals'], + width) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class ColorJitter(BaseTransform): + """Perform ColorJitter to each img. 
+ + Required keys are "imgs", added or modified keys are "imgs". + + Args: + brightness (float | tuple[float]): The jitter range for brightness, if + set as a float, the range will be (1 - brightness, 1 + brightness). + Default: 0.5. + contrast (float | tuple[float]): The jitter range for contrast, if set + as a float, the range will be (1 - contrast, 1 + contrast). + Default: 0.5. + saturation (float | tuple[float]): The jitter range for saturation, if + set as a float, the range will be (1 - saturation, 1 + saturation). + Default: 0.5. + hue (float | tuple[float]): The jitter range for hue, if set as a + float, the range will be (-hue, hue). Default: 0.1. + """ + + @staticmethod + def check_input(val, max, base): + if isinstance(val, tuple): + assert base - max <= val[0] <= val[1] <= base + max + return val + assert val <= max + return (base - val, base + val) + + @staticmethod + def rgb_to_grayscale(img): + return 0.2989 * img[..., 0] + 0.587 * img[..., 1] + 0.114 * img[..., 2] + + @staticmethod + def adjust_contrast(img, factor): + val = np.mean(ColorJitter.rgb_to_grayscale(img)) + return factor * img + (1 - factor) * val + + @staticmethod + def adjust_saturation(img, factor): + gray = np.stack([ColorJitter.rgb_to_grayscale(img)] * 3, axis=-1) + return factor * img + (1 - factor) * gray + + @staticmethod + def adjust_hue(img, factor): + img = np.clip(img, 0, 255).astype(np.uint8) + hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) + offset = int(factor * 255) + hsv[..., 0] = (hsv[..., 0] + offset) % 180 + img = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB) + return img.astype(np.float32) + + def __init__(self, brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1): + self.brightness = self.check_input(brightness, 1, 1) + self.contrast = self.check_input(contrast, 1, 1) + self.saturation = self.check_input(saturation, 1, 1) + self.hue = self.check_input(hue, 0.5, 0) + self.fn_idx = np.random.permutation(4) + + def transform(self, results): + """Perform ColorJitter. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + imgs = results['imgs'] + num_clips, clip_len = 1, len(imgs) + + new_imgs = [] + for i in range(num_clips): + b = np.random.uniform( + low=self.brightness[0], high=self.brightness[1]) + c = np.random.uniform(low=self.contrast[0], high=self.contrast[1]) + s = np.random.uniform( + low=self.saturation[0], high=self.saturation[1]) + h = np.random.uniform(low=self.hue[0], high=self.hue[1]) + start, end = i * clip_len, (i + 1) * clip_len + + for img in imgs[start:end]: + img = img.astype(np.float32) + for fn_id in self.fn_idx: + if fn_id == 0 and b != 1: + img *= b + if fn_id == 1 and c != 1: + img = self.adjust_contrast(img, c) + if fn_id == 2 and s != 1: + img = self.adjust_saturation(img, s) + if fn_id == 3 and h != 0: + img = self.adjust_hue(img, h) + img = np.clip(img, 0, 255).astype(np.uint8) + new_imgs.append(img) + results['imgs'] = new_imgs + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'brightness={self.brightness}, ' + f'contrast={self.contrast}, ' + f'saturation={self.saturation}, ' + f'hue={self.hue})') + return repr_str + + +@TRANSFORMS.register_module() +class CenterCrop(RandomCrop): + """Crop the center area from images. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "keypoint", "crop_bbox", "lazy" and + "img_shape". Required keys in "lazy" is "crop_bbox", added or modified key + is "crop_bbox". + + Args: + crop_size (int | tuple[int]): (w, h) of crop size. + lazy (bool): Determine whether to apply lazy operation. Default: False. 
+ """ + + def __init__(self, crop_size, lazy=False): + self.crop_size = _pair(crop_size) + self.lazy = lazy + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the CenterCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + crop_w, crop_h = self.crop_size + + left = (img_w - crop_w) // 2 + top = (img_h - crop_h) // 2 + right = left + crop_w + bottom = top + crop_h + new_h, new_w = bottom - top, right - left + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure 
only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array([(lazy_left + left), + (lazy_top + top), + (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, ' + f'lazy={self.lazy})') + return repr_str + + +@TRANSFORMS.register_module() +class ThreeCrop(BaseTransform): + """Crop images into three crops. + + Crop the images equally into three crops with equal intervals along the + shorter side. + Required keys are "imgs", "img_shape", added or modified keys are "imgs", + "crop_bbox" and "img_shape". + + Args: + crop_size(int | tuple[int]): (w, h) of crop size. + """ + + def __init__(self, crop_size): + self.crop_size = _pair(crop_size) + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the ThreeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, False) + if 'gt_bboxes' in results or 'proposals' in results: + warnings.warn('ThreeCrop cannot process bounding boxes') + + imgs = results['imgs'] + img_h, img_w = results['imgs'][0].shape[:2] + crop_w, crop_h = self.crop_size + assert crop_h == img_h or crop_w == img_w + + if crop_h == img_h: + w_step = (img_w - crop_w) // 2 + offsets = [ + (0, 0), # left + (2 * w_step, 0), # right + (w_step, 0), # middle + ] + elif crop_w == img_w: + h_step = (img_h - crop_h) // 2 + offsets = [ + (0, 0), # top + (0, 2 * h_step), # down + (0, h_step), # middle + ] + + cropped = [] + crop_bboxes = [] + for x_offset, y_offset in offsets: + bbox = [x_offset, y_offset, x_offset + crop_w, y_offset + crop_h] + crop = [ + img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w] + for img in imgs + ] + cropped.extend(crop) + crop_bboxes.extend([bbox for _ in range(len(imgs))]) + + crop_bboxes = np.array(crop_bboxes) + results['imgs'] = cropped + results['crop_bbox'] = crop_bboxes + results['img_shape'] = results['imgs'][0].shape[:2] + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(crop_size={self.crop_size})' + return repr_str + + +@TRANSFORMS.register_module() +class TenCrop(BaseTransform): + """Crop the images into 10 crops (corner + center + flip). + + Crop the four corners and the center part of the image with the same + given crop_size, and flip it horizontally. + Required keys are "imgs", "img_shape", added or modified keys are "imgs", + "crop_bbox" and "img_shape". + + Args: + crop_size(int | tuple[int]): (w, h) of crop size. + """ + + def __init__(self, crop_size): + self.crop_size = _pair(crop_size) + if not mmengine.is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def transform(self, results): + """Performs the TenCrop augmentation. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, False) + + if 'gt_bboxes' in results or 'proposals' in results: + warnings.warn('TenCrop cannot process bounding boxes') + + imgs = results['imgs'] + + img_h, img_w = results['imgs'][0].shape[:2] + crop_w, crop_h = self.crop_size + + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + + offsets = [ + (0, 0), # upper left + (4 * w_step, 0), # upper right + (0, 4 * h_step), # lower left + (4 * w_step, 4 * h_step), # lower right + (2 * w_step, 2 * h_step), # center + ] + + img_crops = list() + crop_bboxes = list() + for x_offset, y_offsets in offsets: + crop = [ + img[y_offsets:y_offsets + crop_h, x_offset:x_offset + crop_w] + for img in imgs + ] + flip_crop = [np.flip(c, axis=1).copy() for c in crop] + bbox = [x_offset, y_offsets, x_offset + crop_w, y_offsets + crop_h] + img_crops.extend(crop) + img_crops.extend(flip_crop) + crop_bboxes.extend([bbox for _ in range(len(imgs) * 2)]) + + crop_bboxes = np.array(crop_bboxes) + results['imgs'] = img_crops + results['crop_bbox'] = crop_bboxes + results['img_shape'] = results['imgs'][0].shape[:2] + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(crop_size={self.crop_size})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomErasing(BaseTransform): + """Randomly selects a rectangle region in an image and erase pixels. + basically refer mmcls. + + **Required Keys:** + + - img + + **Modified Keys:** + + - img + + Args: + erase_prob (float): Probability that image will be randomly erased. + Default: 0.5 + min_area_ratio (float): Minimum erased area / input image area + Default: 0.02 + max_area_ratio (float): Maximum erased area / input image area + Default: 1/3 + aspect_range (sequence | float): Aspect ratio range of erased area. 
+ if float, it will be converted to (aspect_ratio, 1/aspect_ratio) + Default: (3/10, 10/3) + mode (str): Fill method in erased area, can be: + + - const (default): All pixels are assign with the same value. + - rand: each pixel is assigned with a random value in [0, 255] + + fill_color (sequence | Number): Base color filled in erased area. + Defaults to (128, 128, 128). + fill_std (sequence | Number, optional): If set and ``mode`` is 'rand', + fill erased area with random color from normal distribution + (mean=fill_color, std=fill_std); If not set, fill erased area with + random color from uniform distribution (0~255). Defaults to None. + + Note: + See `Random Erasing Data Augmentation + `_ + + This paper provided 4 modes: RE-R, RE-M, RE-0, RE-255, and use RE-M as + default. The config of these 4 modes are: + + - RE-R: RandomErasing(mode='rand') + - RE-M: RandomErasing(mode='const', fill_color=(123.67, 116.3, 103.5)) + - RE-0: RandomErasing(mode='const', fill_color=0) + - RE-255: RandomErasing(mode='const', fill_color=255) + """ + + def __init__(self, + erase_prob=0.5, + min_area_ratio=0.02, + max_area_ratio=1 / 3, + aspect_range=(3 / 10, 10 / 3), + mode='const', + fill_color=(128, 128, 128), + fill_std=None): + assert isinstance(erase_prob, float) and 0. <= erase_prob <= 1. + assert isinstance(min_area_ratio, float) and 0. <= min_area_ratio <= 1. + assert isinstance(max_area_ratio, float) and 0. <= max_area_ratio <= 1. + assert min_area_ratio <= max_area_ratio, \ + 'min_area_ratio should be smaller than max_area_ratio' + if isinstance(aspect_range, float): + aspect_range = min(aspect_range, 1 / aspect_range) + aspect_range = (aspect_range, 1 / aspect_range) + assert isinstance(aspect_range, Sequence) and len(aspect_range) == 2 \ + and all(isinstance(x, float) for x in aspect_range), \ + 'aspect_range should be a float or Sequence with two float.' + assert all(x > 0 for x in aspect_range), \ + 'aspect_range should be positive.' 
+ assert aspect_range[0] <= aspect_range[1], \ + 'In aspect_range (min, max), min should be smaller than max.' + assert mode in ['const', 'rand'], \ + 'Please select `mode` from ["const", "rand"].' + if isinstance(fill_color, Number): + fill_color = [fill_color] * 3 + assert isinstance(fill_color, Sequence) and len(fill_color) == 3 \ + and all(isinstance(x, Number) for x in fill_color), \ + 'fill_color should be a float or Sequence with three int.' + if fill_std is not None: + if isinstance(fill_std, Number): + fill_std = [fill_std] * 3 + assert isinstance(fill_std, Sequence) and len(fill_std) == 3 \ + and all(isinstance(x, Number) for x in fill_std), \ + 'fill_std should be a float or Sequence with three int.' + + self.erase_prob = erase_prob + self.min_area_ratio = min_area_ratio + self.max_area_ratio = max_area_ratio + self.aspect_range = aspect_range + self.mode = mode + self.fill_color = fill_color + self.fill_std = fill_std + + def _img_fill_pixels(self, img, top, left, h, w): + """Fill pixels to the patch of image.""" + if self.mode == 'const': + patch = np.empty((h, w, 3), dtype=np.uint8) + patch[:, :] = np.array(self.fill_color, dtype=np.uint8) + elif self.fill_std is None: + # Uniform distribution + patch = np.random.uniform(0, 256, (h, w, 3)).astype(np.uint8) + else: + # Normal distribution + patch = np.random.normal(self.fill_color, self.fill_std, (h, w, 3)) + patch = np.clip(patch.astype(np.int32), 0, 255).astype(np.uint8) + + img[top:top + h, left:left + w] = patch + return img + + def _fill_pixels(self, imgs, top, left, h, w): + """Fill pixels to the patch of each image in frame clip.""" + return [self._img_fill_pixels(img, top, left, h, w) for img in imgs] + + @cache_randomness + def random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.erase_prob + + @cache_randomness + def random_patch(self, img_h, img_w): + """Randomly generate patch the erase.""" + # convert the aspect ratio to log space to equally handle 
width and + # height. + log_aspect_range = np.log( + np.array(self.aspect_range, dtype=np.float32)) + aspect_ratio = np.exp(np.random.uniform(*log_aspect_range)) + area = img_h * img_w + area *= np.random.uniform(self.min_area_ratio, self.max_area_ratio) + + h = min(int(round(np.sqrt(area * aspect_ratio))), img_h) + w = min(int(round(np.sqrt(area / aspect_ratio))), img_w) + top = np.random.randint(0, img_h - h) if img_h > h else 0 + left = np.random.randint(0, img_w - w) if img_w > w else 0 + return top, left, h, w + + def transform(self, results): + """ + Args: + results (dict): Results dict from pipeline + + Returns: + dict: Results after the transformation. + """ + if self.random_disable(): + return results + + imgs = results['imgs'] + img_h, img_w = imgs[0].shape[:2] + + imgs = self._fill_pixels(imgs, *self.random_patch(img_h, img_w)) + + results['imgs'] = imgs + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(erase_prob={self.erase_prob}, ' + repr_str += f'min_area_ratio={self.min_area_ratio}, ' + repr_str += f'max_area_ratio={self.max_area_ratio}, ' + repr_str += f'aspect_range={self.aspect_range}, ' + repr_str += f'mode={self.mode}, ' + repr_str += f'fill_color={self.fill_color}, ' + repr_str += f'fill_std={self.fill_std})' + return repr_str diff --git a/mmaction/datasets/transforms/text_transforms.py b/mmaction/datasets/transforms/text_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..46b5d982ca17779cfc696432b6e6ffcd6a1d29cb --- /dev/null +++ b/mmaction/datasets/transforms/text_transforms.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict + +from mmcv.transforms import BaseTransform + +from mmaction.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class CLIPTokenize(BaseTransform): + """Tokenize text and convert to tensor.""" + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`CLIPTokenize`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + + try: + import clip + except ImportError: + raise ImportError('Please run `pip install ' + 'git+https://github.com/openai/CLIP.git` ' + 'to install clip first. ') + + text = results['text'] + text_tokenized = clip.tokenize(text)[0] + results['text'] = text_tokenized + return results diff --git a/mmaction/datasets/transforms/wrappers.py b/mmaction/datasets/transforms/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..cf41be7f60f23e63624b3c527542f5614f519932 --- /dev/null +++ b/mmaction/datasets/transforms/wrappers.py @@ -0,0 +1,380 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random + +import mmengine +import numpy as np +from mmcv.transforms import BaseTransform, to_tensor +from mmengine.utils import digit_version + +from mmaction.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class TorchVisionWrapper(BaseTransform): + """Torchvision Augmentations, under torchvision.transforms. + + Args: + op (str): The name of the torchvision transformation. + """ + + def __init__(self, op, **kwargs): + try: + import torchvision + import torchvision.transforms as tv_trans + except ImportError: + raise RuntimeError('Install torchvision to use TorchvisionTrans') + if digit_version(torchvision.__version__) < digit_version('0.8.0'): + raise RuntimeError('The version of torchvision should be at least ' + '0.8.0') + + trans = getattr(tv_trans, op, None) + assert trans, f'Transform {op} not in torchvision' + self.trans = trans(**kwargs) + + def transform(self, results): + """Perform Torchvision augmentations. 
+ + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert 'imgs' in results + + imgs = [x.transpose(2, 0, 1) for x in results['imgs']] + imgs = to_tensor(np.stack(imgs)) + + imgs = self.trans(imgs).data.numpy() + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + imgs = [x.transpose(1, 2, 0) for x in imgs] + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class PytorchVideoWrapper(BaseTransform): + """PytorchVideoTrans Augmentations, under pytorchvideo.transforms. + + Args: + op (str): The name of the pytorchvideo transformation. + """ + + def __init__(self, op, **kwargs): + try: + import pytorchvideo.transforms as ptv_trans + import torch + except ImportError: + raise RuntimeError('Install pytorchvideo to use PytorchVideoTrans') + if digit_version(torch.__version__) < digit_version('1.8.0'): + raise RuntimeError( + 'The version of PyTorch should be at least 1.8.0') + + trans = getattr(ptv_trans, op, None) + assert trans, f'Transform {op} not in pytorchvideo' + + supported_pytorchvideo_trans = ('AugMix', 'RandAugment', + 'RandomResizedCrop', 'ShortSideScale', + 'RandomShortSideScale') + assert op in supported_pytorchvideo_trans,\ + f'PytorchVideo Transform {op} is not supported in MMAction2' + + self.trans = trans(**kwargs) + self.op = op + + def transform(self, results): + """Perform PytorchVideoTrans augmentations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert 'imgs' in results + + assert 'gt_bboxes' not in results,\ + f'PytorchVideo {self.op} doesn\'t support bboxes yet.' + assert 'proposals' not in results,\ + f'PytorchVideo {self.op} doesn\'t support bboxes yet.' 
+ + if self.op in ('AugMix', 'RandAugment'): + # list[ndarray(h, w, 3)] -> torch.tensor(t, c, h, w) + imgs = [x.transpose(2, 0, 1) for x in results['imgs']] + imgs = to_tensor(np.stack(imgs)) + else: + # list[ndarray(h, w, 3)] -> torch.tensor(c, t, h, w) + # uint8 -> float32 + imgs = to_tensor((np.stack(results['imgs']).transpose(3, 0, 1, 2) / + 255.).astype(np.float32)) + + imgs = self.trans(imgs).data.numpy() + + if self.op in ('AugMix', 'RandAugment'): + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + + # torch.tensor(t, c, h, w) -> list[ndarray(h, w, 3)] + imgs = [x.transpose(1, 2, 0) for x in imgs] + else: + # float32 -> uint8 + imgs = imgs * 255 + imgs[imgs > 255] = 255 + imgs[imgs < 0] = 0 + imgs = imgs.astype(np.uint8) + + # torch.tensor(c, t, h, w) -> list[ndarray(h, w, 3)] + imgs = [x for x in imgs.transpose(1, 2, 3, 0)] + + results['imgs'] = imgs + + return results + + +@TRANSFORMS.register_module() +class ImgAug(BaseTransform): + """Imgaug augmentation. + + Adds custom transformations from imgaug library. + Please visit `https://imgaug.readthedocs.io/en/latest/index.html` + to get more information. Two demo configs could be found in tsn and i3d + config folder. + + It's better to use uint8 images as inputs since imgaug works best with + numpy dtype uint8 and isn't well tested with other dtypes. It should be + noted that not all of the augmenters have the same input and output dtype, + which may cause unexpected results. + + Required keys are "imgs", "img_shape"(if "gt_bboxes" is not None) and + "modality", added or modified keys are "imgs", "img_shape", "gt_bboxes" + and "proposals". + + It is worth mentioning that `Imgaug` will NOT create custom keys like + "interpolation", "crop_bbox", "flip_direction", etc. So when using + `Imgaug` along with other mmaction2 pipelines, we should pay more attention + to required keys. + + Two steps to use `Imgaug` pipeline: + 1. Create initialization parameter `transforms`. 
There are three ways + to create `transforms`. + 1) string: only support `default` for now. + e.g. `transforms='default'` + 2) list[dict]: create a list of augmenters by a list of dicts, each + dict corresponds to one augmenter. Every dict MUST contain a key + named `type`. `type` should be a string(iaa.Augmenter's name) or + an iaa.Augmenter subclass. + e.g. `transforms=[dict(type='Rotate', rotate=(-20, 20))]` + e.g. `transforms=[dict(type=iaa.Rotate, rotate=(-20, 20))]` + 3) iaa.Augmenter: create an imgaug.Augmenter object. + e.g. `transforms=iaa.Rotate(rotate=(-20, 20))` + 2. Add `Imgaug` in dataset pipeline. It is recommended to insert imgaug + pipeline before `Normalize`. A demo pipeline is listed as follows. + ``` + pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=16, + ), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Imgaug', transforms='default'), + # dict(type='Imgaug', transforms=[ + # dict(type='Rotate', rotate=(-20, 20)) + # ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) + ] + ``` + + Args: + transforms (str | list[dict] | :obj:`iaa.Augmenter`): Three different + ways to create imgaug augmenter. 
+ """ + + def __init__(self, transforms): + # Hack to fix incompatibility of ImgAug and latest Numpy + if digit_version(np.__version__) >= digit_version('1.24.0'): + np.bool = bool + import imgaug.augmenters as iaa + + if transforms == 'default': + self.transforms = self.default_transforms() + elif isinstance(transforms, list): + assert all(isinstance(trans, dict) for trans in transforms) + self.transforms = transforms + elif isinstance(transforms, iaa.Augmenter): + self.aug = self.transforms = transforms + else: + raise ValueError('transforms must be `default` or a list of dicts' + ' or iaa.Augmenter object') + + if not isinstance(transforms, iaa.Augmenter): + self.aug = iaa.Sequential( + [self.imgaug_builder(t) for t in self.transforms]) + + @staticmethod + def default_transforms(): + """Default transforms for imgaug. + + Implement RandAugment by imgaug. + Please visit `https://arxiv.org/abs/1909.13719` for more information. + + Augmenters and hyper parameters are borrowed from the following repo: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py # noqa + + Miss one augmenter ``SolarizeAdd`` since imgaug doesn't support this. + + Returns: + dict: The constructed RandAugment transforms. 
+ """ + # RandAugment hyper params + num_augmenters = 2 + cur_magnitude, max_magnitude = 9, 10 + cur_level = 1.0 * cur_magnitude / max_magnitude + + return [ + dict( + type='SomeOf', + n=num_augmenters, + children=[ + dict( + type='ShearX', + shear=17.19 * cur_level * random.choice([-1, 1])), + dict( + type='ShearY', + shear=17.19 * cur_level * random.choice([-1, 1])), + dict( + type='TranslateX', + percent=.2 * cur_level * random.choice([-1, 1])), + dict( + type='TranslateY', + percent=.2 * cur_level * random.choice([-1, 1])), + dict( + type='Rotate', + rotate=30 * cur_level * random.choice([-1, 1])), + dict(type='Posterize', nb_bits=max(1, int(4 * cur_level))), + dict(type='Solarize', threshold=256 * cur_level), + dict(type='EnhanceColor', factor=1.8 * cur_level + .1), + dict(type='EnhanceContrast', factor=1.8 * cur_level + .1), + dict( + type='EnhanceBrightness', factor=1.8 * cur_level + .1), + dict(type='EnhanceSharpness', factor=1.8 * cur_level + .1), + dict(type='Autocontrast', cutoff=0), + dict(type='Equalize'), + dict(type='Invert', p=1.), + dict( + type='Cutout', + nb_iterations=1, + size=0.2 * cur_level, + squared=True) + ]) + ] + + def imgaug_builder(self, cfg): + """Import a module from imgaug. + + It follows the logic of :func:`build_from_cfg`. Use a dict object to + create an iaa.Augmenter object. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj:`iaa.Augmenter`: The constructed imgaug augmenter. 
+ """ + import imgaug.augmenters as iaa + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmengine.is_str(obj_type): + obj_cls = getattr(iaa, obj_type) if hasattr(iaa, obj_type) \ + else getattr(iaa.pillike, obj_type) + elif issubclass(obj_type, iaa.Augmenter): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + for aug_list_key in ['children', 'then_list', 'else_list']: + if aug_list_key in args: + args[aug_list_key] = [ + self.imgaug_builder(child) for child in args[aug_list_key] + ] + + return obj_cls(**args) + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.aug})' + return repr_str + + def transform(self, results): + """Perform Imgaug augmentations. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert results['modality'] == 'RGB', 'Imgaug only support RGB images.' + in_type = results['imgs'][0].dtype + + cur_aug = self.aug.to_deterministic() + + results['imgs'] = [ + cur_aug.augment_image(frame) for frame in results['imgs'] + ] + img_h, img_w, _ = results['imgs'][0].shape + + out_type = results['imgs'][0].dtype + assert in_type == out_type, \ + ('Imgaug input dtype and output dtype are not the same. 
', + f'Convert from {in_type} to {out_type}') + + if 'gt_bboxes' in results: + from imgaug.augmentables import bbs + bbox_list = [ + bbs.BoundingBox( + x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3]) + for bbox in results['gt_bboxes'] + ] + bboxes = bbs.BoundingBoxesOnImage( + bbox_list, shape=results['img_shape']) + bbox_aug, *_ = cur_aug.augment_bounding_boxes([bboxes]) + results['gt_bboxes'] = [[ + max(bbox.x1, 0), + max(bbox.y1, 0), + min(bbox.x2, img_w), + min(bbox.y2, img_h) + ] for bbox in bbox_aug.items] + if 'proposals' in results: + bbox_list = [ + bbs.BoundingBox( + x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3]) + for bbox in results['proposals'] + ] + bboxes = bbs.BoundingBoxesOnImage( + bbox_list, shape=results['img_shape']) + bbox_aug, *_ = cur_aug.augment_bounding_boxes([bboxes]) + results['proposals'] = [[ + max(bbox.x1, 0), + max(bbox.y1, 0), + min(bbox.x2, img_w), + min(bbox.y2, img_h) + ] for bbox in bbox_aug.items] + + results['img_shape'] = (img_h, img_w) + + return results diff --git a/mmaction/datasets/video_dataset.py b/mmaction/datasets/video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ac76dd9ae426060c75e395c9477e11d0eca56de4 --- /dev/null +++ b/mmaction/datasets/video_dataset.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, List, Optional, Union + +from mmengine.fileio import exists, list_from_file + +from mmaction.registry import DATASETS +from mmaction.utils import ConfigType +from .base import BaseActionDataset + + +@DATASETS.register_module() +class VideoDataset(BaseActionDataset): + """Video dataset for action recognition. + + The dataset loads raw videos and apply specified transforms to return a + dict containing the frame tensors and other information. + + The ann_file is a text file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a + whitespace. 
Example of an annotation file:
+
+    .. code-block:: txt
+
+        some/path/000.mp4 1
+        some/path/001.mp4 1
+        some/path/002.mp4 2
+        some/path/003.mp4 2
+        some/path/004.mp4 3
+        some/path/005.mp4 3
+
+
+    Args:
+        ann_file (str): Path to the annotation file.
+        pipeline (List[Union[dict, ConfigDict, Callable]]): A sequence of
+            data transforms.
+        data_prefix (dict or ConfigDict): Path to a directory where videos
+            are held. Defaults to ``dict(video='')``.
+        multi_class (bool): Determines whether the dataset is a multi-class
+            dataset. Defaults to False.
+        num_classes (int, optional): Number of classes of the dataset, used in
+            multi-class datasets. Defaults to None.
+        start_index (int): Specify a start index for frames in consideration of
+            different filename format. However, when taking videos as input,
+            it should be set to 0, since frames loaded from videos count
+            from 0. Defaults to 0.
+        modality (str): Modality of data. Support ``'RGB'``, ``'Flow'``.
+            Defaults to ``'RGB'``.
+        test_mode (bool): Store True when building test or validation dataset.
+            Defaults to False.
+        delimiter (str): Delimiter for the annotation file.
+            Defaults to ``' '`` (whitespace). 
+ """ + + def __init__(self, + ann_file: str, + pipeline: List[Union[dict, Callable]], + data_prefix: ConfigType = dict(video=''), + multi_class: bool = False, + num_classes: Optional[int] = None, + start_index: int = 0, + modality: str = 'RGB', + test_mode: bool = False, + delimiter: str = ' ', + **kwargs) -> None: + self.delimiter = delimiter + super().__init__( + ann_file, + pipeline=pipeline, + data_prefix=data_prefix, + multi_class=multi_class, + num_classes=num_classes, + start_index=start_index, + modality=modality, + test_mode=test_mode, + **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split(self.delimiter) + if self.multi_class: + assert self.num_classes is not None + filename, label = line_split[0], line_split[1:] + label = list(map(int, label)) + # add fake label for inference datalist without label + elif len(line_split) == 1: + filename, label = line_split[0], -1 + else: + filename, label = line_split + label = int(label) + if self.data_prefix['video'] is not None: + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list diff --git a/mmaction/datasets/video_text_dataset.py b/mmaction/datasets/video_text_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b9687bcb9f2ab56e46c4d238e89ea574db1b0a38 --- /dev/null +++ b/mmaction/datasets/video_text_dataset.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +from typing import Dict, List + +from mmengine.fileio import exists + +from mmaction.registry import DATASETS +from .base import BaseActionDataset + + +@DATASETS.register_module() +class VideoTextDataset(BaseActionDataset): + """Video dataset for video-text task like video retrieval.""" + + def load_data_list(self) -> List[Dict]: + """Load annotation file to get video information.""" + exists(self.ann_file) + data_list = [] + + with open(self.ann_file) as f: + video_dict = json.load(f) + for filename, texts in video_dict.items(): + filename = osp.join(self.data_prefix['video'], filename) + video_text_pairs = [] + for text in texts: + data_item = dict(filename=filename, text=text) + video_text_pairs.append(data_item) + data_list.extend(video_text_pairs) + + return data_list diff --git a/mmaction/engine/__init__.py b/mmaction/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d572e2dd39b3b65cf58406fcb651d9e31c11a0 --- /dev/null +++ b/mmaction/engine/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401, F403 +from .model import * # noqa: F401, F403 +from .optimizers import * # noqa: F401, F403 +from .runner import * # noqa: F401, F403 diff --git a/mmaction/engine/hooks/__init__.py b/mmaction/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1b2f6895a27ecc90efd91c3b87d365fe780bae32 --- /dev/null +++ b/mmaction/engine/hooks/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .output import OutputHook +from .visualization_hook import VisualizationHook + +__all__ = ['OutputHook', 'VisualizationHook'] diff --git a/mmaction/engine/hooks/output.py b/mmaction/engine/hooks/output.py new file mode 100644 index 0000000000000000000000000000000000000000..3744b5b2da6a3c2fc7c08b6afebd295ad10efe78 --- /dev/null +++ b/mmaction/engine/hooks/output.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings + +import torch + + +class OutputHook: + """Output feature map of some layers. + + Args: + module (nn.Module): The whole module to get layers. + outputs (tuple[str] | list[str]): Layer name to output. Default: None. + as_tensor (bool): Determine to return a tensor or a numpy array. + Default: False. + """ + + def __init__(self, module, outputs=None, as_tensor=False): + self.outputs = outputs + self.as_tensor = as_tensor + self.layer_outputs = {} + self.handles = [] + self.register(module) + + def register(self, module): + + def hook_wrapper(name): + + def hook(model, input, output): + if not isinstance(output, torch.Tensor): + warnings.warn(f'Directly return the output from {name}, ' + f'since it is not a tensor') + self.layer_outputs[name] = output + elif self.as_tensor: + self.layer_outputs[name] = output + else: + self.layer_outputs[name] = output.detach().cpu().numpy() + + return hook + + if isinstance(self.outputs, (list, tuple)): + for name in self.outputs: + try: + layer = rgetattr(module, name) + h = layer.register_forward_hook(hook_wrapper(name)) + except AttributeError: + raise AttributeError(f'Module {name} not found') + self.handles.append(h) + + def remove(self): + for h in self.handles: + h.remove() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.remove() + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects +def rgetattr(obj, attr, *args): + + def 
_getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/mmaction/engine/hooks/visualization_hook.py b/mmaction/engine/hooks/visualization_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..423efeb19d61f84f66b71c50da2e2a641d1b5056 --- /dev/null +++ b/mmaction/engine/hooks/visualization_hook.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import os.path as osp +from typing import Optional, Sequence + +from mmengine import FileClient +from mmengine.hooks import Hook +from mmengine.runner import EpochBasedTrainLoop, Runner +from mmengine.visualization import Visualizer + +from mmaction.registry import HOOKS +from mmaction.structures import ActionDataSample + + +@HOOKS.register_module() +class VisualizationHook(Hook): + """Classification Visualization Hook. Used to visualize validation and + testing prediction results. + + - If ``out_dir`` is specified, all storage backends are ignored + and save the image to the ``out_dir``. + - If ``show`` is True, plot the result image in a window, please + confirm you are able to access the graphical interface. + Args: + enable (bool): Whether to enable this hook. Defaults to False. + interval (int): The interval of samples to visualize. Defaults to 5000. + show (bool): Whether to display the drawn image. Defaults to False. + out_dir (str, optional): directory where painted images will be saved + in the testing process. If None, handle with the backends of the + visualizer. Defaults to None. + **kwargs: other keyword arguments of + :meth:`mmcls.visualization.ClsVisualizer.add_datasample`. 
+ """ + + def __init__(self, + enable=False, + interval: int = 5000, + show: bool = False, + out_dir: Optional[str] = None, + **kwargs): + self._visualizer: Visualizer = Visualizer.get_current_instance() + + self.enable = enable + self.interval = interval + self.show = show + self.out_dir = out_dir + if out_dir is not None: + self.file_client = FileClient.infer_client(uri=out_dir) + else: + self.file_client = None + + self.draw_args = {**kwargs, 'show': show} + + def _draw_samples(self, + batch_idx: int, + data_batch: dict, + data_samples: Sequence[ActionDataSample], + step: int = 0) -> None: + """Visualize every ``self.interval`` samples from a data batch. + + Args: + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`ActionDataSample`]): Outputs from model. + step (int): Global step value to record. Defaults to 0. + """ + if self.enable is False: + return + + batch_size = len(data_samples) + videos = data_batch['inputs'] + start_idx = batch_size * batch_idx + end_idx = start_idx + batch_size + + # The first index divisible by the interval, after the start index + first_sample_id = math.ceil(start_idx / self.interval) * self.interval + + for sample_id in range(first_sample_id, end_idx, self.interval): + video = videos[sample_id - start_idx] + # move channel to the last + video = video.permute(1, 2, 3, 0).numpy().astype('uint8') + + data_sample = data_samples[sample_id - start_idx] + if 'filename' in data_sample: + # osp.basename works on different platforms even file clients. 
+ sample_name = osp.basename(data_sample.get('filename')) + elif 'frame_dir' in data_sample: + sample_name = osp.basename(data_sample.get('frame_dir')) + else: + sample_name = str(sample_id) + + draw_args = self.draw_args + if self.out_dir is not None: + draw_args['out_path'] = self.file_client.join_path( + self.out_dir, f'{sample_name}_{step}') + + self._visualizer.add_datasample( + sample_name, + video=video, + data_sample=data_sample, + step=step, + **self.draw_args, + ) + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[ActionDataSample]) -> None: + """Visualize every ``self.interval`` samples during validation. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`ActionDataSample`]): Outputs from model. + """ + if isinstance(runner.train_loop, EpochBasedTrainLoop): + step = runner.epoch + else: + step = runner.iter + + self._draw_samples(batch_idx, data_batch, outputs, step=step) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[ActionDataSample]) -> None: + """Visualize every ``self.interval`` samples during test. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]): Outputs from model. + """ + self._draw_samples(batch_idx, data_batch, outputs, step=0) diff --git a/mmaction/engine/model/__init__.py b/mmaction/engine/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52a25d0c72e735bb62283613ab4715e206005782 --- /dev/null +++ b/mmaction/engine/model/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .weight_init import ConvBranchInit + +__all__ = ['ConvBranchInit'] diff --git a/mmaction/engine/model/weight_init.py b/mmaction/engine/model/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..655771031b4f4e7f5a58569918e5c07aee62a775 --- /dev/null +++ b/mmaction/engine/model/weight_init.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +from mmengine.model import BaseInit, update_init_info + +from mmaction.registry import WEIGHT_INITIALIZERS + + +def conv_branch_init(conv: nn.Module, branches: int) -> None: + """Perform initialization for a conv branch. + + Args: + conv (nn.Module): The conv module of a branch. + branches (int): The number of branches. + """ + + weight = conv.weight + n = weight.size(0) + k1 = weight.size(1) + k2 = weight.size(2) + nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches))) + nn.init.constant_(conv.bias, 0) + + +@WEIGHT_INITIALIZERS.register_module('ConvBranch') +class ConvBranchInit(BaseInit): + """Initialize the module parameters of different branches. + + Args: + name (str): The name of the target module. 
+ """ + + def __init__(self, name: str, **kwargs) -> None: + super(ConvBranchInit, self).__init__(**kwargs) + self.name = name + + def __call__(self, module) -> None: + assert hasattr(module, self.name) + + # Take a short cut to get the target module + module = getattr(module, self.name) + num_subset = len(module) + for conv in module: + conv_branch_init(conv, num_subset) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self) -> str: + info = f'{self.__class__.__name__}' + return info diff --git a/mmaction/engine/optimizers/__init__.py b/mmaction/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..691ea56728c686e724c8e6a730fdb9abee3be09e --- /dev/null +++ b/mmaction/engine/optimizers/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optim_wrapper_constructor import \ + LearningRateDecayOptimizerConstructor +from .swin_optim_wrapper_constructor import SwinOptimWrapperConstructor +from .tsm_optim_wrapper_constructor import TSMOptimWrapperConstructor + +__all__ = [ + 'TSMOptimWrapperConstructor', 'SwinOptimWrapperConstructor', + 'LearningRateDecayOptimizerConstructor' +] diff --git a/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5996be7fa01590292f6a617992f448475e922e27 --- /dev/null +++ b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +from typing import List + +import torch.nn as nn +from mmengine.dist import get_dist_info +from mmengine.logging import MMLogger +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_layer_id_for_vit(var_name: str, max_layer_id: int) -> int: + """Get the layer id to set the different learning rates for ViT. + + Args: + var_name (str): The key of the model. + num_max_layer (int): Maximum number of backbone layers. + Returns: + int: Returns the layer id of the key. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.blocks'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return max_layer_id + 1 + + +def get_layer_id_for_mvit(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum layer id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.blocks'): + layer_id = int(var_name.split('.')[2]) + 1 + return layer_id + else: + return max_layer_id + 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + """ + Different learning rates are set for different layers of backbone. + Note: Currently, this optimizer constructor is built for MViT. 
+ + Inspiration from `the implementation in PySlowFast + `_ and MMDetection + `_ + """ + + def add_params(self, params: List[dict], module: nn.Module, + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + logger = MMLogger.get_current_instance() + + parameter_groups = {} + logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + logger.info('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + + for m in module.modules(): + assert not isinstance(m, nn.modules.batchnorm._NormBase + ), 'BN is not supported with layer decay' + + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'MViT' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_mvit( + name, self.paramwise_cfg.get('num_layers')) + logger.info(f'set param {name} as id {layer_id}') + elif 'VisionTransformer' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_vit(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + else: + raise NotImplementedError(f'Only support layer wise decay,' + f'but got {decay_type}') + + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id + 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) diff --git a/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py b/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..8515c58466b2b30e9dc8c562a0c284ada2d6450e --- /dev/null +++ b/mmaction/engine/optimizers/swin_optim_wrapper_constructor.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import reduce +from operator import mul +from typing import List + +import torch.nn as nn +from mmengine.logging import print_log +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class SwinOptimWrapperConstructor(DefaultOptimWrapperConstructor): + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = 'base', + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module. Defaults to ``'base'``. + """ + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if not param.requires_grad: + params.append(param_group) + continue + + param_group['lr'] = self.base_lr + if self.base_wd is not None: + param_group['weight_decay'] = self.base_wd + + processing_keys = [ + key for key in self.paramwise_cfg if key in f'{prefix}.{name}' + ] + if processing_keys: + param_group['lr'] *= \ + reduce(mul, [self.paramwise_cfg[key].get('lr_mult', 1.) + for key in processing_keys]) + if self.base_wd is not None: + param_group['weight_decay'] *= \ + reduce(mul, [self.paramwise_cfg[key]. + get('decay_mult', 1.) 
+ for key in processing_keys]) + + params.append(param_group) + + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- ' + f'{full_name}: {key} = {round(value, 8)}', + logger='current') + + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params(params, child_mod, prefix=child_prefix) diff --git a/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py b/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..9f197d9941f7cd2009db3e52d45e3821e1c5f355 --- /dev/null +++ b/mmaction/engine/optimizers/tsm_optim_wrapper_constructor.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.optim import DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm_, + _BatchNorm, _ConvNd) + +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class TSMOptimWrapperConstructor(DefaultOptimWrapperConstructor): + """Optimizer constructor in TSM model. + + This constructor builds optimizer in different ways from the default one. + + 1. Parameters of the first conv layer have default lr and weight decay. + 2. Parameters of BN layers have default lr and zero weight decay. + 3. If the field "fc_lr5" in paramwise_cfg is set to True, the parameters + of the last fc layer in cls_head have 5x lr multiplier and 10x weight + decay multiplier. + 4. Weights of other layers have default lr and weight decay, and biases + have a 2x lr multiplier and zero weight decay. + """ + + def add_params(self, params, model, **kwargs): + """Add parameters and their corresponding lr and wd to the params. 
+ + Args: + params (list): The list to be modified, containing all parameter + groups and their corresponding lr and wd configurations. + model (nn.Module): The model to be trained with the optimizer. + """ + # use fc_lr5 to determine whether to specify higher multi-factor + # for fc layer weights and bias. + fc_lr5 = self.paramwise_cfg['fc_lr5'] + first_conv_weight = [] + first_conv_bias = [] + normal_weight = [] + normal_bias = [] + lr5_weight = [] + lr10_bias = [] + bn = [] + + conv_cnt = 0 + + for m in model.modules(): + if isinstance(m, _ConvNd): + m_params = list(m.parameters()) + conv_cnt += 1 + if conv_cnt == 1: + first_conv_weight.append(m_params[0]) + if len(m_params) == 2: + first_conv_bias.append(m_params[1]) + else: + normal_weight.append(m_params[0]) + if len(m_params) == 2: + normal_bias.append(m_params[1]) + elif isinstance(m, torch.nn.Linear): + m_params = list(m.parameters()) + normal_weight.append(m_params[0]) + if len(m_params) == 2: + normal_bias.append(m_params[1]) + elif isinstance(m, + (_BatchNorm, SyncBatchNorm_, torch.nn.GroupNorm)): + for param in list(m.parameters()): + if param.requires_grad: + bn.append(param) + elif len(m._modules) == 0: + if len(list(m.parameters())) > 0: + raise ValueError(f'New atomic module type: {type(m)}. 
' + 'Need to give it a learning policy') + + # pop the cls_head fc layer params + last_fc_weight = normal_weight.pop() + last_fc_bias = normal_bias.pop() + if fc_lr5: + lr5_weight.append(last_fc_weight) + lr10_bias.append(last_fc_bias) + else: + normal_weight.append(last_fc_weight) + normal_bias.append(last_fc_bias) + + params.append({ + 'params': first_conv_weight, + 'lr': self.base_lr, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': first_conv_bias, + 'lr': self.base_lr * 2, + 'weight_decay': 0 + }) + params.append({ + 'params': normal_weight, + 'lr': self.base_lr, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': normal_bias, + 'lr': self.base_lr * 2, + 'weight_decay': 0 + }) + params.append({'params': bn, 'lr': self.base_lr, 'weight_decay': 0}) + params.append({ + 'params': lr5_weight, + 'lr': self.base_lr * 5, + 'weight_decay': self.base_wd + }) + params.append({ + 'params': lr10_bias, + 'lr': self.base_lr * 10, + 'weight_decay': 0 + }) diff --git a/mmaction/engine/runner/__init__.py b/mmaction/engine/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4bc7841840bb3b35848736cebf64372053e12e7b --- /dev/null +++ b/mmaction/engine/runner/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .multi_loop import MultiLoaderEpochBasedTrainLoop +from .retrieval_loop import RetrievalTestLoop, RetrievalValLoop + +__all__ = [ + 'MultiLoaderEpochBasedTrainLoop', 'RetrievalValLoop', 'RetrievalTestLoop' +] diff --git a/mmaction/engine/runner/multi_loop.py b/mmaction/engine/runner/multi_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..466d625c6f11cb7bb3c013f097568d3eaad62762 --- /dev/null +++ b/mmaction/engine/runner/multi_loop.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import gc +from typing import Dict, List, Union + +from mmengine.runner import EpochBasedTrainLoop +from torch.utils.data import DataLoader + +from mmaction.registry import LOOPS + + +class EpochMultiLoader: + """Multi loaders based on epoch.""" + + def __init__(self, dataloaders: List[DataLoader]): + self._dataloaders = dataloaders + self.iter_loaders = [iter(loader) for loader in self._dataloaders] + + @property + def num_loaders(self): + """The number of dataloaders.""" + return len(self._dataloaders) + + def __iter__(self): + """Return self when executing __iter__.""" + return self + + def __next__(self): + """Get the next iter's data of multiple loaders.""" + data = tuple([next(loader) for loader in self.iter_loaders]) + return data + + def __len__(self): + """Get the length of loader.""" + return min([len(loader) for loader in self._dataloaders]) + + +@LOOPS.register_module() +class MultiLoaderEpochBasedTrainLoop(EpochBasedTrainLoop): + """EpochBasedTrainLoop with multiple dataloaders. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or Dict): A dataloader object or a dict to + build a dataloader for training the model. + other_loaders (List of Dataloader or Dict): A list of other loaders. + Each item in the list is a dataloader object or a dict to build + a dataloader. + max_epochs (int): Total training epochs. + val_begin (int): The epoch that begins validating. Defaults to 1. + val_interval (int): Validation interval. Defaults to 1. 
+ """ + + def __init__(self, + runner, + dataloader: Union[Dict, DataLoader], + other_loaders: List[Union[Dict, DataLoader]], + max_epochs: int, + val_begin: int = 1, + val_interval: int = 1) -> None: + super().__init__(runner, dataloader, max_epochs, val_begin, + val_interval) + multi_loaders = [self.dataloader] + for loader in other_loaders: + if isinstance(loader, dict): + loader = runner.build_dataloader(loader, seed=runner.seed) + multi_loaders.append(loader) + + self.multi_loaders = multi_loaders + + def run_epoch(self) -> None: + """Iterate one epoch.""" + self.runner.call_hook('before_train_epoch') + self.runner.model.train() + + gc.collect() + for loader in self.multi_loaders: + if hasattr(loader, 'sampler') and hasattr(loader.sampler, + 'set_epoch'): + loader.sampler.set_epoch(self._epoch) + + for idx, data_batch in enumerate(EpochMultiLoader(self.multi_loaders)): + self.run_iter(idx, data_batch) + + self.runner.call_hook('after_train_epoch') + self._epoch += 1 diff --git a/mmaction/engine/runner/retrieval_loop.py b/mmaction/engine/runner/retrieval_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..db829bcecef43c34a3dc2a16b7bdc5684fe8c375 --- /dev/null +++ b/mmaction/engine/runner/retrieval_loop.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch +from mmengine.model import is_model_wrapper +from mmengine.runner import TestLoop, ValLoop, autocast + +from mmaction.registry import LOOPS + + +@LOOPS.register_module() +class RetrievalValLoop(ValLoop): + """Loop for multimodal retrieval val. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + evaluator (Evaluator or dict or list): Used for computing metrics. + fp16 (bool): Whether to enable fp16 valing. Defaults to + False. 
+ """ + + def run(self) -> dict: + """Launch val.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.eval() + + feats_local = [] + data_samples_local = [] + + for idx, data_batch in enumerate(self.dataloader): + with torch.no_grad(): + self.runner.call_hook( + 'before_val_iter', batch_idx=idx, data_batch=data_batch) + # predictions should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + if is_model_wrapper(self.runner.model): + data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501 + else: + data_preprocessor = self.runner.model.data_preprocessor + + # get features for retrieval instead of data samples + data_batch = data_preprocessor(data_batch, False) + feats = self.runner.model._run_forward( + data_batch, mode='tensor') + feats_local.append(feats) + data_samples_local.extend(data_batch['data_samples']) + self.runner.call_hook( + 'after_val_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=feats) + + # concatenate different features + feats_local = { + k: torch.cat([dic[k] for dic in feats_local]) + for k in feats_local[0] + } + + # get predictions + if is_model_wrapper(self.runner.model): + predict_all_fn = self.runner.model.module.predict_all + else: + predict_all_fn = self.runner.model.predict_all + + num_videos = self.dataloader.dataset.num_videos + num_texts = self.dataloader.dataset.num_texts + with torch.no_grad(): + with autocast(enabled=self.fp16): + i2t_data_samples, t2i_data_samples = predict_all_fn( + feats_local, + data_samples_local, + num_images=num_videos, + num_texts=num_texts, + ) + # process in evaluator and compute metrics + self.evaluator.process(i2t_data_samples, None) + i2t_metrics = self.evaluator.evaluate(num_videos) + i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()} + self.evaluator.process(t2i_data_samples, None) + t2i_metrics = self.evaluator.evaluate(num_texts) + t2i_metrics = {f't2i/{k}': v for k, v in 
t2i_metrics.items()} + metrics = {**i2t_metrics, **t2i_metrics} + self.runner.call_hook('after_val_epoch', metrics=metrics) + self.runner.call_hook('after_val') + return metrics + + +@LOOPS.register_module() +class RetrievalTestLoop(TestLoop): + """Loop for multimodal retrieval test. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + evaluator (Evaluator or dict or list): Used for computing metrics. + fp16 (bool): Whether to enable fp16 testing. Defaults to + False. + """ + + def run(self) -> dict: + """Launch test.""" + self.runner.call_hook('before_test') + self.runner.call_hook('before_test_epoch') + self.runner.model.eval() + + feats_local = [] + data_samples_local = [] + + for idx, data_batch in enumerate(self.dataloader): + with torch.no_grad(): + self.runner.call_hook( + 'before_test_iter', batch_idx=idx, data_batch=data_batch) + # predictions should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + if is_model_wrapper(self.runner.model): + data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501 + else: + data_preprocessor = self.runner.model.data_preprocessor + # get features for retrieval instead of data samples + data_batch = data_preprocessor(data_batch, False) + feats = self.runner.model._run_forward( + data_batch, mode='tensor') + feats_local.append(feats) + data_samples_local.extend(data_batch['data_samples']) + self.runner.call_hook( + 'after_test_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=feats) + + # concatenate different features + feats_local = { + k: torch.cat([dic[k] for dic in feats_local]) + for k in feats_local[0] + } + + # get predictions + if is_model_wrapper(self.runner.model): + predict_all_fn = self.runner.model.module.predict_all + else: + predict_all_fn = self.runner.model.predict_all + + num_videos = self.dataloader.dataset.num_videos + num_texts = self.dataloader.dataset.num_texts + with 
torch.no_grad(): + with autocast(enabled=self.fp16): + i2t_data_samples, t2i_data_samples = predict_all_fn( + feats_local, + data_samples_local, + num_images=num_videos, + num_texts=num_texts, + ) + + # process in evaluator and compute metrics + self.evaluator.process(i2t_data_samples, None) + i2t_metrics = self.evaluator.evaluate(num_videos) + i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()} + self.evaluator.process(t2i_data_samples, None) + t2i_metrics = self.evaluator.evaluate(num_texts) + t2i_metrics = {f't2i/{k}': v for k, v in t2i_metrics.items()} + metrics = {**i2t_metrics, **t2i_metrics} + + self.runner.call_hook('after_test_epoch', metrics=metrics) + self.runner.call_hook('after_test') + return metrics diff --git a/mmaction/evaluation/__init__.py b/mmaction/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf038e034f065673cab674735be2ab9102d3eba5 --- /dev/null +++ b/mmaction/evaluation/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .functional import * # noqa: F401,F403 +from .metrics import * # noqa: F401,F403 diff --git a/mmaction/evaluation/__pycache__/__init__.cpython-312.pyc b/mmaction/evaluation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..823145a3540a1c54c77a9d74e0d4949d1f1b9ffd Binary files /dev/null and b/mmaction/evaluation/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/__init__.py b/mmaction/evaluation/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38e6d5756d466d2bf3669e994c6f5dfcc33a5b9f --- /dev/null +++ b/mmaction/evaluation/functional/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .accuracy import (average_precision_at_temporal_iou, + average_recall_at_avg_proposals, confusion_matrix, + get_weighted_score, interpolated_precision_recall, + mean_average_precision, mean_class_accuracy, + mmit_mean_average_precision, pairwise_temporal_iou, + softmax, top_k_accuracy, top_k_classes) +from .ava_utils import ava_eval, read_labelmap, results2csv +from .eval_detection import ActivityNetLocalization +from .multisports_utils import frameAP, link_tubes, videoAP, videoAP_all + +__all__ = [ + 'top_k_accuracy', 'mean_class_accuracy', 'confusion_matrix', + 'mean_average_precision', 'get_weighted_score', + 'average_recall_at_avg_proposals', 'pairwise_temporal_iou', + 'average_precision_at_temporal_iou', 'ActivityNetLocalization', 'softmax', + 'interpolated_precision_recall', 'mmit_mean_average_precision', + 'top_k_classes', 'read_labelmap', 'ava_eval', 'results2csv', 'frameAP', + 'videoAP', 'link_tubes', 'videoAP_all' +] diff --git a/mmaction/evaluation/functional/__pycache__/__init__.cpython-312.pyc b/mmaction/evaluation/functional/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cefd2d78e7897275fb61cff8d672a6411bc6565b Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/accuracy.cpython-312.pyc b/mmaction/evaluation/functional/__pycache__/accuracy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f56fbf34db8851c82cf39c1b666783425c93bfb8 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/accuracy.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-312.pyc b/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3a0e96cd696e81ec6de3e2225b9353adad77fb3 Binary files /dev/null and 
b/mmaction/evaluation/functional/__pycache__/ava_utils.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-312.pyc b/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7ac06f073ce8dbe3b8f1b593983c901adcbfeff Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/eval_detection.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-312.pyc b/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a6ba3c3bf05e7fbcd39d8febf4252bba57a06b0 Binary files /dev/null and b/mmaction/evaluation/functional/__pycache__/multisports_utils.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/accuracy.py b/mmaction/evaluation/functional/accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..d033a1102af03c042de61e497215176409a0f1d0 --- /dev/null +++ b/mmaction/evaluation/functional/accuracy.py @@ -0,0 +1,568 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def confusion_matrix(y_pred, y_real, normalize=None): + """Compute confusion matrix. + + Args: + y_pred (list[int] | np.ndarray[int]): Prediction labels. + y_real (list[int] | np.ndarray[int]): Ground truth labels. + normalize (str | None): Normalizes confusion matrix over the true + (rows), predicted (columns) conditions or all the population. + If None, confusion matrix will not be normalized. Options are + "true", "pred", "all", None. Default: None. + + Returns: + np.ndarray: Confusion matrix. 
+ """ + if normalize not in ['true', 'pred', 'all', None]: + raise ValueError("normalize must be one of {'true', 'pred', " + "'all', None}") + + if isinstance(y_pred, list): + y_pred = np.array(y_pred) + if y_pred.dtype == np.int32: + y_pred = y_pred.astype(np.int64) + if not isinstance(y_pred, np.ndarray): + raise TypeError( + f'y_pred must be list or np.ndarray, but got {type(y_pred)}') + if not y_pred.dtype == np.int64: + raise TypeError( + f'y_pred dtype must be np.int64, but got {y_pred.dtype}') + + if isinstance(y_real, list): + y_real = np.array(y_real) + if y_real.dtype == np.int32: + y_real = y_real.astype(np.int64) + if not isinstance(y_real, np.ndarray): + raise TypeError( + f'y_real must be list or np.ndarray, but got {type(y_real)}') + if not y_real.dtype == np.int64: + raise TypeError( + f'y_real dtype must be np.int64, but got {y_real.dtype}') + + label_set = np.unique(np.concatenate((y_pred, y_real))) + num_labels = len(label_set) + max_label = label_set[-1] + label_map = np.zeros(max_label + 1, dtype=np.int64) + for i, label in enumerate(label_set): + label_map[label] = i + + y_pred_mapped = label_map[y_pred] + y_real_mapped = label_map[y_real] + + confusion_mat = np.bincount( + num_labels * y_real_mapped + y_pred_mapped, + minlength=num_labels**2).reshape(num_labels, num_labels) + + with np.errstate(all='ignore'): + if normalize == 'true': + confusion_mat = ( + confusion_mat / confusion_mat.sum(axis=1, keepdims=True)) + elif normalize == 'pred': + confusion_mat = ( + confusion_mat / confusion_mat.sum(axis=0, keepdims=True)) + elif normalize == 'all': + confusion_mat = (confusion_mat / confusion_mat.sum()) + confusion_mat = np.nan_to_num(confusion_mat) + + return confusion_mat + + +def mean_class_accuracy(scores, labels): + """Calculate mean class accuracy. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int]): Ground truth labels. + + Returns: + np.ndarray: Mean class accuracy. 
+ """ + pred = np.argmax(scores, axis=1) + cf_mat = confusion_matrix(pred, labels).astype(float) + + cls_cnt = cf_mat.sum(axis=1) + cls_hit = np.diag(cf_mat) + + mean_class_acc = np.mean( + [hit / cnt if cnt else 0.0 for cnt, hit in zip(cls_cnt, cls_hit)]) + + return mean_class_acc + + +def top_k_classes(scores, labels, k=10, mode='accurate'): + """Calculate the most K accurate (inaccurate) classes. + + Given the prediction scores, ground truth label and top-k value, + compute the top K accurate (inaccurate) classes. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int] | np.ndarray): Ground truth labels. + k (int): Top-k values. Default: 10. + mode (str): Comparison mode for Top-k. Options are 'accurate' + and 'inaccurate'. Default: 'accurate'. + + Return: + list: List of sorted (from high accuracy to low accuracy for + 'accurate' mode, and from low accuracy to high accuracy for + inaccurate mode) top K classes in format of (label_id, + acc_ratio). + """ + assert mode in ['accurate', 'inaccurate'] + pred = np.argmax(scores, axis=1) + cf_mat = confusion_matrix(pred, labels).astype(float) + + cls_cnt = cf_mat.sum(axis=1) + cls_hit = np.diag(cf_mat) + hit_ratio = np.array( + [hit / cnt if cnt else 0.0 for cnt, hit in zip(cls_cnt, cls_hit)]) + + if mode == 'accurate': + max_index = np.argsort(hit_ratio)[-k:][::-1] + max_value = hit_ratio[max_index] + results = list(zip(max_index, max_value)) + else: + min_index = np.argsort(hit_ratio)[:k] + min_value = hit_ratio[min_index] + results = list(zip(min_index, min_value)) + return results + + +def top_k_accuracy(scores, labels, topk=(1, )): + """Calculate top k accuracy score. + + Args: + scores (list[np.ndarray]): Prediction scores for each class. + labels (list[int]): Ground truth labels. + topk (tuple[int]): K value for top_k_accuracy. Default: (1, ). + + Returns: + list[float]: Top k accuracy score for each k. 
+ """ + res = [] + labels = np.array(labels)[:, np.newaxis] + for k in topk: + max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1] + match_array = np.logical_or.reduce(max_k_preds == labels, axis=1) + topk_acc_score = match_array.sum() / match_array.shape[0] + res.append(topk_acc_score) + + return res + + +def mmit_mean_average_precision(scores, labels): + """Mean average precision for multi-label recognition. Used for reporting + MMIT style mAP on Multi-Moments in Times. The difference is that this + method calculates average-precision for each sample and averages them among + samples. + + Args: + scores (list[np.ndarray]): Prediction scores of different classes for + each sample. + labels (list[np.ndarray]): Ground truth many-hot vector for each + sample. + + Returns: + np.float64: The MMIT style mean average precision. + """ + results = [] + for score, label in zip(scores, labels): + precision, recall, _ = binary_precision_recall_curve(score, label) + ap = -np.sum(np.diff(recall) * np.array(precision)[:-1]) + results.append(ap) + return np.mean(results) + + +def mean_average_precision(scores, labels): + """Mean average precision for multi-label recognition. + + Args: + scores (list[np.ndarray]): Prediction scores of different classes for + each sample. + labels (list[np.ndarray]): Ground truth many-hot vector for each + sample. + + Returns: + np.float64: The mean average precision. + """ + results = [] + scores = np.stack(scores).T + labels = np.stack(labels).T + + for score, label in zip(scores, labels): + precision, recall, _ = binary_precision_recall_curve(score, label) + ap = -np.sum(np.diff(recall) * np.array(precision)[:-1]) + results.append(ap) + results = [x for x in results if not np.isnan(x)] + if results == []: + return np.nan + return np.mean(results) + + +def binary_precision_recall_curve(y_score, y_true): + """Calculate the binary precision recall curve at step thresholds. + + Args: + y_score (np.ndarray): Prediction scores for each class. 
+ Shape should be (num_classes, ). + y_true (np.ndarray): Ground truth many-hot vector. + Shape should be (num_classes, ). + + Returns: + precision (np.ndarray): The precision of different thresholds. + recall (np.ndarray): The recall of different thresholds. + thresholds (np.ndarray): Different thresholds at which precision and + recall are tested. + """ + assert isinstance(y_score, np.ndarray) + assert isinstance(y_true, np.ndarray) + assert y_score.shape == y_true.shape + + # make y_true a boolean vector + y_true = (y_true == 1) + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind='mergesort')[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + # There may be ties in values, therefore find the `distinct_value_inds` + distinct_value_inds = np.where(np.diff(y_score))[0] + threshold_inds = np.r_[distinct_value_inds, y_true.size - 1] + # accumulate the true positives with decreasing threshold + tps = np.cumsum(y_true)[threshold_inds] + fps = 1 + threshold_inds - tps + thresholds = y_score[threshold_inds] + + precision = tps / (tps + fps) + precision[np.isnan(precision)] = 0 + recall = tps / tps[-1] + # stop when full recall attained + # and reverse the outputs so recall is decreasing + last_ind = tps.searchsorted(tps[-1]) + sl = slice(last_ind, None, -1) + + return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] + + +def pairwise_temporal_iou(candidate_segments, + target_segments, + calculate_overlap_self=False): + """Compute intersection over union between segments. + + Args: + candidate_segments (np.ndarray): 1-dim/2-dim array in format + ``[init, end]/[m x 2:=[init, end]]``. + target_segments (np.ndarray): 2-dim array in format + ``[n x 2:=[init, end]]``. + calculate_overlap_self (bool): Whether to calculate overlap_self + (union / candidate_length) or not. Default: False. + + Returns: + t_iou (np.ndarray): 1-dim array [n] / + 2-dim array [n x m] with IoU ratio. 
+ t_overlap_self (np.ndarray, optional): 1-dim array [n] / + 2-dim array [n x m] with overlap_self, returns when + calculate_overlap_self is True. + """ + candidate_segments_ndim = candidate_segments.ndim + if target_segments.ndim != 2 or candidate_segments_ndim not in [1, 2]: + raise ValueError('Dimension of arguments is incorrect') + + if candidate_segments_ndim == 1: + candidate_segments = candidate_segments[np.newaxis, :] + + n, m = target_segments.shape[0], candidate_segments.shape[0] + t_iou = np.empty((n, m), dtype=np.float32) + if calculate_overlap_self: + t_overlap_self = np.empty((n, m), dtype=np.float32) + + for i in range(m): + candidate_segment = candidate_segments[i, :] + tt1 = np.maximum(candidate_segment[0], target_segments[:, 0]) + tt2 = np.minimum(candidate_segment[1], target_segments[:, 1]) + # Intersection including Non-negative overlap score. + segments_intersection = (tt2 - tt1).clip(0) + # Segment union. + segments_union = ((target_segments[:, 1] - target_segments[:, 0]) + + (candidate_segment[1] - candidate_segment[0]) - + segments_intersection) + # Compute overlap as the ratio of the intersection + # over union of two segments. + t_iou[:, i] = (segments_intersection.astype(float) / segments_union) + if calculate_overlap_self: + candidate_length = candidate_segment[1] - candidate_segment[0] + t_overlap_self[:, i] = ( + segments_intersection.astype(float) / candidate_length) + + if candidate_segments_ndim == 1: + t_iou = np.squeeze(t_iou, axis=1) + if calculate_overlap_self: + if candidate_segments_ndim == 1: + t_overlap_self = np.squeeze(t_overlap_self, axis=1) + return t_iou, t_overlap_self + + return t_iou + + +def average_recall_at_avg_proposals(ground_truth, + proposals, + total_num_proposals, + max_avg_proposals=None, + temporal_iou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """Computes the average recall given an average number (percentile) of + proposals per video. 
+ + Args: + ground_truth (dict): Dict containing the ground truth instances. + proposals (dict): Dict containing the proposal instances. + total_num_proposals (int): Total number of proposals in the + proposal dict. + max_avg_proposals (int | None): Max number of proposals for one video. + Default: None. + temporal_iou_thresholds (np.ndarray): 1D array with temporal_iou + thresholds. Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + tuple([np.ndarray, np.ndarray, np.ndarray, float]): + (recall, average_recall, proposals_per_video, auc) + In recall, ``recall[i,j]`` is recall at i-th temporal_iou threshold + at the j-th average number (percentile) of average number of + proposals per video. The average_recall is recall averaged + over a list of temporal_iou threshold (1D array). This is + equivalent to ``recall.mean(axis=0)``. The ``proposals_per_video`` + is the average number of proposals per video. The auc is the area + under ``AR@AN`` curve. + """ + + total_num_videos = len(ground_truth) + + if not max_avg_proposals: + max_avg_proposals = float(total_num_proposals) / total_num_videos + + ratio = (max_avg_proposals * float(total_num_videos) / total_num_proposals) + + # For each video, compute temporal_iou scores among the retrieved proposals + score_list = [] + total_num_retrieved_proposals = 0 + for video_id in ground_truth: + # Get proposals for this video. + proposals_video_id = proposals[video_id] + this_video_proposals = proposals_video_id[:, :2] + # Sort proposals by score. + sort_idx = proposals_video_id[:, 2].argsort()[::-1] + this_video_proposals = this_video_proposals[sort_idx, :].astype( + np.float32) + + # Get ground-truth instances associated to this video. 
+ ground_truth_video_id = ground_truth[video_id] + this_video_ground_truth = ground_truth_video_id[:, :2].astype( + np.float32) + if this_video_proposals.shape[0] == 0: + n = this_video_ground_truth.shape[0] + score_list.append(np.zeros((n, 1))) + continue + + if this_video_proposals.ndim != 2: + this_video_proposals = np.expand_dims(this_video_proposals, axis=0) + if this_video_ground_truth.ndim != 2: + this_video_ground_truth = np.expand_dims( + this_video_ground_truth, axis=0) + + num_retrieved_proposals = np.minimum( + int(this_video_proposals.shape[0] * ratio), + this_video_proposals.shape[0]) + total_num_retrieved_proposals += num_retrieved_proposals + this_video_proposals = this_video_proposals[: + num_retrieved_proposals, :] + + # Compute temporal_iou scores. + t_iou = pairwise_temporal_iou(this_video_proposals, + this_video_ground_truth) + score_list.append(t_iou) + + # Given that the length of the videos is really varied, we + # compute the number of proposals in terms of a ratio of the total + # proposals retrieved, i.e. average recall at a percentage of proposals + # retrieved per video. + + # Computes average recall. + pcn_list = np.arange(1, 101) / 100.0 * ( + max_avg_proposals * float(total_num_videos) / + total_num_retrieved_proposals) + matches = np.empty((total_num_videos, pcn_list.shape[0])) + positives = np.empty(total_num_videos) + recall = np.empty((temporal_iou_thresholds.shape[0], pcn_list.shape[0])) + # Iterates over each temporal_iou threshold. + for ridx, temporal_iou in enumerate(temporal_iou_thresholds): + # Inspect positives retrieved per video at different + # number of proposals (percentage of the total retrieved). + for i, score in enumerate(score_list): + # Total positives per video. + positives[i] = score.shape[0] + # Find proposals that satisfies minimum temporal_iou threshold. + true_positives_temporal_iou = score >= temporal_iou + # Get number of proposals as a percentage of total retrieved. 
+ pcn_proposals = np.minimum( + (score.shape[1] * pcn_list).astype(np.int32), score.shape[1]) + + for j, num_retrieved_proposals in enumerate(pcn_proposals): + # Compute the number of matches + # for each percentage of the proposals + matches[i, j] = np.count_nonzero( + (true_positives_temporal_iou[:, :num_retrieved_proposals] + ).sum(axis=1)) + + # Computes recall given the set of matches per video. + recall[ridx, :] = matches.sum(axis=0) / positives.sum() + + # Recall is averaged. + avg_recall = recall.mean(axis=0) + + # Get the average number of proposals per video. + proposals_per_video = pcn_list * ( + float(total_num_retrieved_proposals) / total_num_videos) + # Get AUC + area_under_curve = np.trapz(avg_recall, proposals_per_video) + auc = 100. * float(area_under_curve) / proposals_per_video[-1] + return recall, avg_recall, proposals_per_video, auc + + +def get_weighted_score(score_list, coeff_list): + """Get weighted score with given scores and coefficients. + + Given n predictions by different classifier: [score_1, score_2, ..., + score_n] (score_list) and their coefficients: [coeff_1, coeff_2, ..., + coeff_n] (coeff_list), return weighted score: weighted_score = + score_1 * coeff_1 + score_2 * coeff_2 + ... + score_n * coeff_n + + Args: + score_list (list[list[np.ndarray]]): List of list of scores, with shape + n(number of predictions) X num_samples X num_classes + coeff_list (list[float]): List of coefficients, with shape n. + + Returns: + list[np.ndarray]: List of weighted scores. 
+ """ + assert len(score_list) == len(coeff_list) + num_samples = len(score_list[0]) + for i in range(1, len(score_list)): + assert len(score_list[i]) == num_samples + + scores = np.array(score_list) # (num_coeff, num_samples, num_classes) + coeff = np.array(coeff_list) # (num_coeff, ) + weighted_scores = list(np.dot(scores.T, coeff).T) + return weighted_scores + + +def softmax(x, dim=1): + """Compute softmax values for each sets of scores in x.""" + e_x = np.exp(x - np.max(x, axis=dim, keepdims=True)) + return e_x / e_x.sum(axis=dim, keepdims=True) + + +def interpolated_precision_recall(precision, recall): + """Interpolated AP - VOCdevkit from VOC 2011. + + Args: + precision (np.ndarray): The precision of different thresholds. + recall (np.ndarray): The recall of different thresholds. + + Returns: + float: Average precision score. + """ + mprecision = np.hstack([[0], precision, [0]]) + mrecall = np.hstack([[0], recall, [1]]) + for i in range(len(mprecision) - 1)[::-1]: + mprecision[i] = max(mprecision[i], mprecision[i + 1]) + idx = np.where(mrecall[1::] != mrecall[0:-1])[0] + 1 + ap = np.sum((mrecall[idx] - mrecall[idx - 1]) * mprecision[idx]) + return ap + + +def average_precision_at_temporal_iou(ground_truth, + prediction, + temporal_iou_thresholds=(np.linspace( + 0.5, 0.95, 10))): + """Compute average precision (in detection task) between ground truth and + predicted data frames. If multiple predictions match the same predicted + segment, only the one with highest score is matched as true positive. This + code is greatly inspired by Pascal VOC devkit. + + Args: + ground_truth (dict): Dict containing the ground truth instances. + Key: 'video_id' + Value (np.ndarray): 1D array of 't-start' and 't-end'. + prediction (np.ndarray): 2D array containing the information of + proposal instances, including 'video_id', 'class_id', 't-start', + 't-end' and 'score'. + temporal_iou_thresholds (np.ndarray): 1D array with temporal_iou + thresholds. 
Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + np.ndarray: 1D array of average precision score. + """ + ap = np.zeros(len(temporal_iou_thresholds), dtype=np.float32) + if len(prediction) < 1: + return ap + + num_gts = 0. + lock_gt = dict() + for key in ground_truth: + lock_gt[key] = np.ones( + (len(temporal_iou_thresholds), len(ground_truth[key]))) * -1 + num_gts += len(ground_truth[key]) + + # Sort predictions by decreasing score order. + prediction = np.array(prediction) + scores = prediction[:, 4].astype(float) + sort_idx = np.argsort(scores)[::-1] + prediction = prediction[sort_idx] + + # Initialize true positive and false positive vectors. + tp = np.zeros((len(temporal_iou_thresholds), len(prediction)), + dtype=np.int32) + fp = np.zeros((len(temporal_iou_thresholds), len(prediction)), + dtype=np.int32) + + # Assigning true positive to truly grount truth instances. + for idx, this_pred in enumerate(prediction): + + # Check if there is at least one ground truth in the video. + if this_pred[0] in ground_truth: + this_gt = np.array(ground_truth[this_pred[0]], dtype=float) + else: + fp[:, idx] = 1 + continue + + t_iou = pairwise_temporal_iou(this_pred[2:4].astype(float), this_gt) + # We would like to retrieve the predictions with highest t_iou score. + t_iou_sorted_idx = t_iou.argsort()[::-1] + for t_idx, t_iou_threshold in enumerate(temporal_iou_thresholds): + for jdx in t_iou_sorted_idx: + if t_iou[jdx] < t_iou_threshold: + fp[t_idx, idx] = 1 + break + if lock_gt[this_pred[0]][t_idx, jdx] >= 0: + continue + # Assign as true positive after the filters above. 
+ tp[t_idx, idx] = 1 + lock_gt[this_pred[0]][t_idx, jdx] = idx + break + + if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: + fp[t_idx, idx] = 1 + + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float32) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float32) + recall_cumsum = tp_cumsum / num_gts + + precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) + + for t_idx in range(len(temporal_iou_thresholds)): + ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], + recall_cumsum[t_idx, :]) + + return ap diff --git a/mmaction/evaluation/functional/ava_evaluation/README.md b/mmaction/evaluation/functional/ava_evaluation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3137daf47cd98c0eb97b995a1937ddbf78f630b6 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/README.md @@ -0,0 +1,2 @@ +The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). +Some unused codes are removed to minimize the length of codes added. diff --git a/mmaction/evaluation/functional/ava_evaluation/__init__.py b/mmaction/evaluation/functional/ava_evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-312.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6f1af82b7fe3fd031aa58c103ad0557d9587882 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-312.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9354d9b08fdba8d27d245071ee3d863e57f45504 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/metrics.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-312.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b736b2a3c5a229762f1337dce719046994c0c3fe Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_list.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-312.pyc b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba5df459b4b35de205160d232b4da4d4bf6f9490 Binary files /dev/null and b/mmaction/evaluation/functional/ava_evaluation/__pycache__/np_box_ops.cpython-312.pyc differ diff --git a/mmaction/evaluation/functional/ava_evaluation/metrics.py b/mmaction/evaluation/functional/ava_evaluation/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6e83182e2521a5ebe4766103af0db619703054 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/metrics.py @@ -0,0 +1,142 @@ +# Copyright 
2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Functions for computing metrics like precision, recall, CorLoc and etc.""" + +import numpy as np + + +def compute_precision_recall(scores, labels, num_gt): + """Compute precision and recall. + + Args: + scores: A float numpy array representing detection score + labels: A boolean numpy array representing true/false positive labels + num_gt: Number of ground truth instances + + Raises: + ValueError: if the input is not of the correct format + + Returns: + precision: Fraction of positive instances over detected ones. This + value is None if no ground truth labels are present. + recall: Fraction of detected positive instance over all positive + instances. This value is None if no ground truth labels are + present. 
+ """ + if (not isinstance(labels, np.ndarray) or labels.dtype != bool + or len(labels.shape) != 1): + raise ValueError('labels must be single dimension bool numpy array') + + if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: + raise ValueError('scores must be single dimension numpy array') + + if num_gt < np.sum(labels): + raise ValueError( + 'Number of true positives must be smaller than num_gt.') + + if len(scores) != len(labels): + raise ValueError('scores and labels must be of the same size.') + + if num_gt == 0: + return None, None + + sorted_indices = np.argsort(scores) + sorted_indices = sorted_indices[::-1] + labels = labels.astype(int) + true_positive_labels = labels[sorted_indices] + false_positive_labels = 1 - true_positive_labels + cum_true_positives = np.cumsum(true_positive_labels) + cum_false_positives = np.cumsum(false_positive_labels) + precision = cum_true_positives.astype(float) / ( + cum_true_positives + cum_false_positives) + recall = cum_true_positives.astype(float) / num_gt + return precision, recall + + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + + Precision is modified to ensure that it does not decrease as recall + decrease. + + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + + Raises: + ValueError: if the input is not of the correct format + + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. 
+ """ + if precision is None: + if recall is not None: + raise ValueError('If precision is None, recall must also be None') + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError('precision and recall must be numpy array') + if precision.dtype != np.float64 or recall.dtype != np.float64: + raise ValueError('input must be float numpy array.') + if len(precision) != len(recall): + raise ValueError('precision and recall must be of the same size.') + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError('Precision must be in the range of [0, 1].') + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError('recall must be in the range of [0, 1].') + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError('recall must be a non-decreasing array') + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, + num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + + Returns nans if there are no ground truth images for a class. 
+ + Args: + num_gt_imgs_per_class: 1D array, representing number of images + containing at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number + of images that are correctly detected at least one object instance + of a particular class + + Returns: + corloc_per_class: A float numpy array represents the corloc score of + each class + """ + # Divide by zero expected for classes with no gt examples. + with np.errstate(divide='ignore', invalid='ignore'): + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) diff --git a/mmaction/evaluation/functional/ava_evaluation/np_box_list.py b/mmaction/evaluation/functional/ava_evaluation/np_box_list.py new file mode 100644 index 0000000000000000000000000000000000000000..528cbb5539bca21a54b240ee284b59a200e33006 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/np_box_list.py @@ -0,0 +1,139 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Numpy BoxList classes and functions.""" + +import numpy as np + + +class BoxList: + """Box collection. + + BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. 
It is assumed that all bounding boxes within + a given list correspond to a single image. + + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError( + 'Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + + Args: + field: a string parameter used to specify a related field to be + accessed. + field_data: a numpy array of [N, ...] representing the data + associated with the field. + Raises: + ValueError: if the field is already exist or the dimension of the + field data does not matches the number of boxes. + """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes( + ): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. 
+ + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box + collection. + + Args: + field: a string parameter used to specify a related field to be + accessed. + + Returns: + a numpy 1-d array representing data of an associated field + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + @staticmethod + def _is_valid_boxes(data): + """Check whether data fulfills the format of N*[ymin, xmin, ymax, + xmin]. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Returns: + a boolean indicating whether all ymax of boxes are equal or greater + than ymin, and all xmax of boxes are equal or greater than xmin. + """ + if len(data) != 0: + for v in data: + if v[0] > v[2] or v[1] > v[3]: + return False + return True diff --git a/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py b/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b542383045d13f27bc8a869abd9cb31df603e788 --- /dev/null +++ b/mmaction/evaluation/functional/ava_evaluation/np_box_ops.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for [N, 4] numpy arrays representing bounding boxes. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" + +import numpy as np + + +def area(boxes): + """Computes area of boxes. + + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. 
+ + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum( + np.zeros(all_pairs_max_ymin.shape), + all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum( + np.zeros(all_pairs_max_xmin.shape), + all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = ( + np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - + intersect) + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. 
+ """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas diff --git a/mmaction/evaluation/functional/ava_utils.py b/mmaction/evaluation/functional/ava_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2e144779c5c7354a0767e4c0456d3386e5783530 --- /dev/null +++ b/mmaction/evaluation/functional/ava_utils.py @@ -0,0 +1,300 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This piece of code is directly adapted from ActivityNet official repo +# https://github.com/activitynet/ActivityNet/blob/master/ +# Evaluation/get_ava_performance.py. Some unused codes are removed. +import csv +import multiprocessing +import time +from collections import defaultdict + +import numpy as np + +from .ava_evaluation import metrics, np_box_list, np_box_ops + + +def det2csv(results, custom_classes): + """Convert detection results to csv file.""" + csv_results = [] + for idx in range(len(results)): + video_id = results[idx]['video_id'] + timestamp = results[idx]['timestamp'] + result = results[idx]['outputs'] + for label, _ in enumerate(result): + for bbox in result[label]: + bbox_ = tuple(bbox.tolist()) + if custom_classes is not None: + actual_label = custom_classes[label + 1] + else: + actual_label = label + 1 + csv_results.append(( + video_id, + timestamp, + ) + bbox_[:4] + (actual_label, ) + bbox_[4:]) + return csv_results + + +# results is organized by class +def results2csv(results, out_file, custom_classes=None): + """Convert detection results to csv file.""" + csv_results = det2csv(results, custom_classes) + + # save space for float + def to_str(item): + if isinstance(item, float): + return f'{item:.4f}' + return str(item) + + with open(out_file, 'w') as f: + for csv_result in csv_results: + f.write(','.join(map(to_str, csv_result))) + f.write('\n') + + +def print_time(message, start): + """Print processing time.""" + print('==> %g seconds to %s' % (time.time() - start, message), 
flush=True) + + +def make_image_key(video_id, timestamp): + """Returns a unique identifier for a video id & timestamp.""" + return f'{video_id},{int(timestamp):04d}' + + +def read_csv(csv_file, class_whitelist=None): + """Loads boxes and class labels from a CSV file in the AVA format. + + CSV file format described at https://research.google.com/ava/download.html. + + Args: + csv_file: A file object. + class_whitelist: If provided, boxes corresponding to (integer) class + labels not in this set are skipped. + + Returns: + boxes: A dictionary mapping each unique image key (string) to a list of + boxes, given as coordinates [y1, x1, y2, x2]. + labels: A dictionary mapping each unique image key (string) to a list + of integer class labels, matching the corresponding box in `boxes`. + scores: A dictionary mapping each unique image key (string) to a list + of score values labels, matching the corresponding label in `labels`. + If scores are not provided in the csv, then they will default to 1.0. + """ + entries = defaultdict(list) + boxes = defaultdict(list) + labels = defaultdict(list) + scores = defaultdict(list) + reader = csv.reader(csv_file) + for row in reader: + assert len(row) in [7, 8], 'Wrong number of columns: ' + row + image_key = make_image_key(row[0], row[1]) + x1, y1, x2, y2 = [float(n) for n in row[2:6]] + action_id = int(row[6]) + if class_whitelist and action_id not in class_whitelist: + continue + + score = 1.0 + if len(row) == 8: + score = float(row[7]) + + entries[image_key].append((score, action_id, y1, x1, y2, x2)) + + for image_key in entries: + # Evaluation API assumes boxes with descending scores + entry = sorted(entries[image_key], key=lambda tup: -tup[0]) + boxes[image_key] = [x[2:] for x in entry] + labels[image_key] = [x[1] for x in entry] + scores[image_key] = [x[0] for x in entry] + + return boxes, labels, scores + + +def read_exclusions(exclusions_file): + """Reads a CSV file of excluded timestamps. 
+ + Args: + exclusions_file: A file object containing a csv of video-id,timestamp. + + Returns: + A set of strings containing excluded image keys, e.g. + "aaaaaaaaaaa,0904", + or an empty set if exclusions file is None. + """ + excluded = set() + if exclusions_file: + reader = csv.reader(exclusions_file) + for row in reader: + assert len(row) == 2, f'Expected only 2 columns, got: {row}' + excluded.add(make_image_key(row[0], row[1])) + return excluded + + +def read_labelmap(labelmap_file): + """Reads a labelmap without the dependency on protocol buffers. + + Args: + labelmap_file: A file object containing a label map protocol buffer. + + Returns: + labelmap: The label map in the form used by the + object_detection_evaluation + module - a list of {"id": integer, "name": classname } dicts. + class_ids: A set containing all of the valid class id integers. + """ + labelmap = [] + class_ids = set() + name = '' + class_id = '' + for line in labelmap_file: + if line.startswith(' name:'): + name = line.split('"')[1] + elif line.startswith(' id:') or line.startswith(' label_id:'): + class_id = int(line.strip().split(' ')[-1]) + labelmap.append({'id': class_id, 'name': name}) + class_ids.add(class_id) + return labelmap, class_ids + + +def get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, + groundtruth_boxes): + + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) + + iou = np_box_ops.iou(detected_boxlist.get(), gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, scores, num_boxes + + +def tpfp_single(tup, threshold=0.5): + gt_bboxes, gt_labels, bboxes, labels, scores = tup + ret_scores, ret_tp_fp_labels = dict(), dict() + all_labels = list(set(labels)) + for label in all_labels: + gt_bbox = np.array( + [x for x, y in zip(gt_bboxes, gt_labels) if y == 
label], + dtype=np.float32).reshape(-1, 4) + bbox = np.array([x for x, y in zip(bboxes, labels) if y == label], + dtype=np.float32).reshape(-1, 4) + score = np.array([x for x, y in zip(scores, labels) if y == label], + dtype=np.float32).reshape(-1) + iou, score, num_boxes = get_overlaps_and_scores_box_mode( + bbox, score, gt_bbox) + if gt_bbox.size == 0: + ret_scores[label] = score + ret_tp_fp_labels[label] = np.zeros(num_boxes, dtype=bool) + continue + tp_fp_labels = np.zeros(num_boxes, dtype=bool) + if iou.shape[1] > 0: + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= threshold: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + ret_scores[label], ret_tp_fp_labels[label] = score, tp_fp_labels + return ret_scores, ret_tp_fp_labels + + +# Seems there is at most 100 detections for each image +def ava_eval(result_file, + result_type, + label_file, + ann_file, + exclude_file, + verbose=True, + ignore_empty_frames=True, + custom_classes=None): + """Perform ava evaluation.""" + + assert result_type in ['mAP'] + start = time.time() + categories, class_whitelist = read_labelmap(open(label_file)) + if custom_classes is not None: + custom_classes = custom_classes[1:] + assert set(custom_classes).issubset(set(class_whitelist)) + class_whitelist = custom_classes + categories = [cat for cat in categories if cat['id'] in custom_classes] + + # loading gt, do not need gt score + gt_bboxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) + if verbose: + print_time('Reading GT results', start) + + if exclude_file is not None: + excluded_keys = read_exclusions(open(exclude_file)) + else: + excluded_keys = list() + + start = time.time() + boxes, labels, scores = read_csv(open(result_file), class_whitelist) + if verbose: + print_time('Reading Detection results', start) + + start = time.time() + 
all_gt_labels = np.concatenate(list(gt_labels.values())) + gt_count = {k: np.sum(all_gt_labels == k) for k in class_whitelist} + + pool = multiprocessing.Pool(32) + if ignore_empty_frames: + tups = [(gt_bboxes[k], gt_labels[k], boxes[k], labels[k], scores[k]) + for k in gt_bboxes if k not in excluded_keys] + else: + tups = [(gt_bboxes.get(k, np.zeros((0, 4), dtype=np.float32)), + gt_labels.get(k, []), boxes[k], labels[k], scores[k]) + for k in boxes if k not in excluded_keys] + rets = pool.map(tpfp_single, tups) + + if verbose: + print_time('Calculating TP/FP', start) + + start = time.time() + scores, tpfps = defaultdict(list), defaultdict(list) + for score, tpfp in rets: + for k in score: + scores[k].append(score[k]) + tpfps[k].append(tpfp[k]) + + cls_AP = [] + for k in scores: + scores[k] = np.concatenate(scores[k]) + tpfps[k] = np.concatenate(tpfps[k]) + precision, recall = metrics.compute_precision_recall( + scores[k], tpfps[k], gt_count[k]) + ap = metrics.compute_average_precision(precision, recall) + class_name = [x['name'] for x in categories if x['id'] == k] + assert len(class_name) == 1 + class_name = class_name[0] + cls_AP.append((k, class_name, ap)) + if verbose: + print_time('Run Evaluator', start) + + print('Per-class results: ', flush=True) + for k, class_name, ap in cls_AP: + print(f'Index: {k}, Action: {class_name}: AP: {ap:.4f};', flush=True) + + overall = np.nanmean([x[2] for x in cls_AP]) + person_movement = np.nanmean([x[2] for x in cls_AP if x[0] <= 14]) + object_manipulation = np.nanmean([x[2] for x in cls_AP if 14 < x[0] < 64]) + person_interaction = np.nanmean([x[2] for x in cls_AP if 64 <= x[0]]) + + print('Overall Results: ', flush=True) + print(f'Overall mAP: {overall:.4f}', flush=True) + print(f'Person Movement mAP: {person_movement:.4f}', flush=True) + print(f'Object Manipulation mAP: {object_manipulation:.4f}', flush=True) + print(f'Person Interaction mAP: {person_interaction:.4f}', flush=True) + + results = {} + results['overall'] = 
overall + results['person_movement'] = person_movement + results['object_manipulation'] = object_manipulation + results['person_interaction'] = person_interaction + + if verbose: + for k, class_name, ap in cls_AP: + print(f'Class {class_name} AP: {ap:.4f}', flush=True) + + return results diff --git a/mmaction/evaluation/functional/eval_detection.py b/mmaction/evaluation/functional/eval_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..9a32a8d1c92631cdf649501f6859ce0e233f2fb3 --- /dev/null +++ b/mmaction/evaluation/functional/eval_detection.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np +from mmengine.logging import MMLogger, print_log + +from .accuracy import interpolated_precision_recall, pairwise_temporal_iou + + +class ActivityNetLocalization: + """Class to evaluate detection results on ActivityNet. + + Args: + ground_truth_filename (str | None): The filename of groundtruth. + Default: None. + prediction_filename (str | None): The filename of action detection + results. Default: None. + tiou_thresholds (np.ndarray): The thresholds of temporal iou to + evaluate. Default: ``np.linspace(0.5, 0.95, 10)``. + verbose (bool): Whether to print verbose logs. Default: False. + """ + + def __init__(self, + ground_truth_filename=None, + prediction_filename=None, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + verbose=False): + if not ground_truth_filename: + raise IOError('Please input a valid ground truth file.') + if not prediction_filename: + raise IOError('Please input a valid prediction file.') + self.ground_truth_filename = ground_truth_filename + self.prediction_filename = prediction_filename + self.tiou_thresholds = tiou_thresholds + self.verbose = verbose + self.ap = None + self.logger = MMLogger.get_current_instance() + # Import ground truth and predictions. 
+ self.ground_truth, self.activity_index = self._import_ground_truth( + ground_truth_filename) + self.prediction = self._import_prediction(prediction_filename) + + if self.verbose: + log_msg = ( + '[INIT] Loaded ground_truth from ' + f'{self.ground_truth_filename}, prediction from ' + f'{self.prediction_filename}.\n' + f'Number of ground truth instances: {len(self.ground_truth)}\n' + f'Number of predictions: {len(self.prediction)}\n' + f'Fixed threshold for tiou score: {self.tiou_thresholds}') + print_log(log_msg, logger=self.logger) + + @staticmethod + def _import_ground_truth(ground_truth_filename): + """Read ground truth file and return the ground truth instances and the + activity classes. + + Args: + ground_truth_filename (str): Full path to the ground truth json + file. + + Returns: + tuple[list, dict]: (ground_truth, activity_index). + ground_truth contains the ground truth instances, which is in a + dict format. + activity_index contains classes index. + """ + with open(ground_truth_filename, 'r') as f: + data = json.load(f) + # Checking format + activity_index, class_idx = {}, 0 + ground_truth = [] + for video_id, video_info in data.items(): + for anno in video_info['annotations']: + if anno['label'] not in activity_index: + activity_index[anno['label']] = class_idx + class_idx += 1 + # old video_anno + ground_truth_item = {} + ground_truth_item['video-id'] = video_id[2:] + ground_truth_item['t-start'] = float(anno['segment'][0]) + ground_truth_item['t-end'] = float(anno['segment'][1]) + ground_truth_item['label'] = activity_index[anno['label']] + ground_truth.append(ground_truth_item) + + return ground_truth, activity_index + + def _import_prediction(self, prediction_filename): + """Read prediction file and return the prediction instances. + + Args: + prediction_filename (str): Full path to the prediction json file. + + Returns: + List: List containing the prediction instances (dictionaries). 
+ """ + with open(prediction_filename, 'r') as f: + data = json.load(f) + # Read predictions. + prediction = [] + for video_id, video_info in data['results'].items(): + for result in video_info: + prediction_item = dict() + prediction_item['video-id'] = video_id + prediction_item['label'] = self.activity_index[result['label']] + prediction_item['t-start'] = float(result['segment'][0]) + prediction_item['t-end'] = float(result['segment'][1]) + prediction_item['score'] = result['score'] + prediction.append(prediction_item) + + return prediction + + def wrapper_compute_average_precision(self): + """Computes average precision for each class.""" + ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index))) + + # Adaptation to query faster + ground_truth_by_label = [] + prediction_by_label = [] + for i in range(len(self.activity_index)): + ground_truth_by_label.append([]) + prediction_by_label.append([]) + for gt in self.ground_truth: + ground_truth_by_label[gt['label']].append(gt) + for pred in self.prediction: + prediction_by_label[pred['label']].append(pred) + + for i in range(len(self.activity_index)): + ap_result = compute_average_precision_detection( + ground_truth_by_label[i], prediction_by_label[i], + self.tiou_thresholds) + ap[:, i] = ap_result + + return ap + + def evaluate(self): + """Evaluates a prediction file. + + For the detection task we measure the interpolated mean average + precision to measure the performance of a method. + """ + self.ap = self.wrapper_compute_average_precision() + + self.mAP = self.ap.mean(axis=1) + self.average_mAP = self.mAP.mean() + + return self.mAP, self.average_mAP + + +def compute_average_precision_detection(ground_truth, + prediction, + tiou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """Compute average precision (detection task) between ground truth and + predictions data frames. If multiple predictions occurs for the same + predicted segment, only the one with highest score is matches as true + positive. 
This code is greatly inspired by Pascal VOC devkit. + + Args: + ground_truth (list[dict]): List containing the ground truth instances + (dictionaries). Required keys are 'video-id', 't-start' and + 't-end'. + prediction (list[dict]): List containing the prediction instances + (dictionaries). Required keys are: 'video-id', 't-start', 't-end' + and 'score'. + tiou_thresholds (np.ndarray): A 1darray indicates the temporal + intersection over union threshold, which is optional. + Default: ``np.linspace(0.5, 0.95, 10)``. + + Returns: + Float: ap, Average precision score. + """ + num_thresholds = len(tiou_thresholds) + num_gts = len(ground_truth) + num_preds = len(prediction) + ap = np.zeros(num_thresholds) + if len(prediction) == 0: + return ap + + num_positive = float(num_gts) + lock_gt = np.ones((num_thresholds, num_gts)) * -1 + # Sort predictions by decreasing score order. + prediction.sort(key=lambda x: -x['score']) + # Initialize true positive and false positive vectors. + tp = np.zeros((num_thresholds, num_preds)) + fp = np.zeros((num_thresholds, num_preds)) + + # Adaptation to query faster + ground_truth_by_videoid = {} + for i, item in enumerate(ground_truth): + item['index'] = i + ground_truth_by_videoid.setdefault(item['video-id'], []).append(item) + + # Assigning true positive to truly grount truth instances. + for idx, pred in enumerate(prediction): + if pred['video-id'] in ground_truth_by_videoid: + gts = ground_truth_by_videoid[pred['video-id']] + else: + fp[:, idx] = 1 + continue + + tiou_arr = pairwise_temporal_iou( + np.array([pred['t-start'], pred['t-end']]), + np.array([np.array([gt['t-start'], gt['t-end']]) for gt in gts])) + tiou_arr = tiou_arr.reshape(-1) + # We would like to retrieve the predictions with highest tiou score. 
+ tiou_sorted_idx = tiou_arr.argsort()[::-1] + for t_idx, tiou_threshold in enumerate(tiou_thresholds): + for j_idx in tiou_sorted_idx: + if tiou_arr[j_idx] < tiou_threshold: + fp[t_idx, idx] = 1 + break + if lock_gt[t_idx, gts[j_idx]['index']] >= 0: + continue + # Assign as true positive after the filters above. + tp[t_idx, idx] = 1 + lock_gt[t_idx, gts[j_idx]['index']] = idx + break + + if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: + fp[t_idx, idx] = 1 + + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float64) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float64) + recall_cumsum = tp_cumsum / num_positive + + precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) + + for t_idx in range(len(tiou_thresholds)): + ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], + recall_cumsum[t_idx, :]) + + return ap diff --git a/mmaction/evaluation/functional/multisports_utils.py b/mmaction/evaluation/functional/multisports_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9d4d08e6eb3a73781cef34883c113c5cb3d6b451 --- /dev/null +++ b/mmaction/evaluation/functional/multisports_utils.py @@ -0,0 +1,685 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/MCG-NJU/MultiSports +# Original licence: Copyright (c) MCG-NJU, under the MIT License. 
# ------------------------------------------------------------------------------

import math
from collections import defaultdict

import numpy as np
from mmengine.logging import MMLogger
from rich.progress import track


def area2d_voc(b):
    """Compute the areas for a set of 2D boxes.

    Args:
        b (np.ndarray): Boxes of shape (N, 4) in ``(x1, y1, x2, y2)`` order.

    Returns:
        np.ndarray: Areas of shape (N, ).
    """
    return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])


def overlap2d_voc(b1, b2):
    """Compute the intersection areas between a set of boxes b1 and one box
    b2 (broadcast row-wise)."""
    xmin = np.maximum(b1[:, 0], b2[:, 0])
    ymin = np.maximum(b1[:, 1], b2[:, 1])
    xmax = np.minimum(b1[:, 2], b2[:, 2])
    ymax = np.minimum(b1[:, 3], b2[:, 3])

    width = np.maximum(0, xmax - xmin)
    height = np.maximum(0, ymax - ymin)

    return width * height


def iou2d_voc(b1, b2):
    """Compute the IoU between a set of boxes b1 and 1 box b2.

    1-D inputs are promoted to a single row; ``b2`` must contain exactly
    one box.
    """
    if b1.ndim == 1:
        b1 = b1[None, :]
    if b2.ndim == 1:
        b2 = b2[None, :]

    assert b2.shape[0] == 1

    ov = overlap2d_voc(b1, b2)

    return ov / (area2d_voc(b1) + area2d_voc(b2) - ov)


def iou3d_voc(b1, b2):
    """Compute the IoU between two tubes with same temporal extent.

    Each tube has shape (T, 5) with rows ``(frame, x1, y1, x2, y2)``; the
    frame columns must match exactly. The result is the mean per-frame
    spatial IoU.
    """
    assert b1.shape[0] == b2.shape[0]
    assert np.all(b1[:, 0] == b2[:, 0])

    ov = overlap2d_voc(b1[:, 1:5], b2[:, 1:5])

    return np.mean(ov / (area2d_voc(b1[:, 1:5]) + area2d_voc(b2[:, 1:5]) - ov))


def iou3dt_voc(b1, b2, spatialonly=False, temporalonly=False):
    """Compute the spatio-temporal IoU between two tubes.

    Args:
        b1, b2 (np.ndarray): Tubes of shape (T, 5), rows being
            ``(frame, x1, y1, x2, y2)`` with ascending frame indices.
        spatialonly (bool): If True, skip the temporal IoU factor.
        temporalonly (bool): If True, return only the temporal IoU.

    Returns:
        float: The requested IoU; 0.0 when the tubes do not overlap in time.
    """
    tmin = max(b1[0, 0], b2[0, 0])
    tmax = min(b1[-1, 0], b2[-1, 0])

    if tmax < tmin:
        return 0.0

    temporal_inter = tmax - tmin
    temporal_union = max(b1[-1, 0], b2[-1, 0]) - min(b1[0, 0], b2[0, 0])

    # Take the first matching row index explicitly: the previous
    # `int(np.where(...)[0])` relied on the deprecated size-1-array ->
    # scalar conversion and raises on NumPy >= 1.25.
    b1_start = int(np.where(b1[:, 0] == tmin)[0][0])
    b1_end = int(np.where(b1[:, 0] == tmax)[0][0])
    b2_start = int(np.where(b2[:, 0] == tmin)[0][0])
    b2_end = int(np.where(b2[:, 0] == tmax)[0][0])
    tube1 = b1[b1_start:b1_end + 1, :]
    tube2 = b2[b2_start:b2_end + 1, :]

    if temporalonly:
        return temporal_inter / temporal_union
    return iou3d_voc(tube1, tube2) * (
        1. if spatialonly else temporal_inter / temporal_union)
def pr_to_ap_voc(pr):
    """Convert a precision-recall curve to average precision (VOC style).

    Args:
        pr (np.ndarray): Array of shape (N, 2) whose columns are
            ``(precision, recall)`` with recall non-decreasing.

    Returns:
        float: Average precision, the area under the interpolated
            precision-recall curve.
    """
    precision = pr[:, 0]
    recall = pr[:, 1]
    recall = np.concatenate([[0], recall, [1]])
    precision = np.concatenate([[0], precision, [0]])

    # Interpolate precision: replace each value by the maximum precision
    # to its right so the curve is monotonically non-increasing.
    for i in range(len(precision) - 2, -1, -1):
        precision[i] = np.maximum(precision[i], precision[i + 1])

    # Sum rectangle areas where recall actually changes.
    indices = np.where(recall[1:] != recall[:-1])[0] + 1
    average_precision = np.sum(
        (recall[indices] - recall[indices - 1]) * precision[indices])
    return average_precision


def nms_tubelets(dets, overlapThresh=0.3, top_k=None):
    """Soft-NMS for a set of scored tubelets.

    Scored tubelets are a numpy array with ``4 * K + 1`` columns: ``K``
    boxes ``(x1, y1, x2, y2)`` followed by a confidence score. Scores of
    overlapping tubelets are decayed and the rescored tubelets are
    returned sorted by decreasing score.

    Args:
        dets (np.ndarray): Scored tubelets of shape (N, 4 * K + 1).
            The score column is rescored in place.
        overlapThresh (float): Mean per-box IoU above which a tubelet's
            score is decayed. Default: 0.3.
        top_k (int, optional): Maximum number of tubelets to return.
            Default: None (keep all).

    Returns:
        np.ndarray: Rescored tubelets, sorted by decreasing score and
            truncated to ``top_k`` rows.
    """
    # If there are no detections, return an empty list
    if len(dets) == 0:
        return dets
    if top_k is None:
        top_k = len(dets)

    K = int((dets.shape[1] - 1) / 4)

    # Coordinates of bounding boxes
    x1 = [dets[:, 4 * k] for k in range(K)]
    y1 = [dets[:, 4 * k + 1] for k in range(K)]
    x2 = [dets[:, 4 * k + 2] for k in range(K)]
    y2 = [dets[:, 4 * k + 3] for k in range(K)]

    scores = dets[:, -1]
    area = [(x2[k] - x1[k] + 1) * (y2[k] - y1[k] + 1) for k in range(K)]
    order = np.argsort(scores)[::-1]
    # Per-tubelet score decay factor (soft-NMS); 1 means no decay.
    weight = np.ones_like(scores)

    while order.size > 0:
        i = order[0]

        # Compute overlap of the current highest-scored tubelet with the
        # remaining ones, box by box.
        xx1 = [np.maximum(x1[k][i], x1[k][order[1:]]) for k in range(K)]
        yy1 = [np.maximum(y1[k][i], y1[k][order[1:]]) for k in range(K)]
        xx2 = [np.minimum(x2[k][i], x2[k][order[1:]]) for k in range(K)]
        yy2 = [np.minimum(y2[k][i], y2[k][order[1:]]) for k in range(K)]

        w = [np.maximum(0, xx2[k] - xx1[k] + 1) for k in range(K)]
        h = [np.maximum(0, yy2[k] - yy1[k] + 1) for k in range(K)]

        inter_area = [w[k] * h[k] for k in range(K)]
        # Sum of per-box IoUs, compared against overlapThresh * K below.
        ious = sum([
            inter_area[k] / (area[k][order[1:]] + area[k][i] - inter_area[k])
            for k in range(K)
        ])
        index = np.where(ious > overlapThresh * K)[0]
        weight[order[index + 1]] = 1 - ious[index]

        index2 = np.where(ious <= overlapThresh * K)[0]
        order = order[index2 + 1]

    dets[:, -1] = dets[:, -1] * weight

    new_scores = dets[:, -1]
    new_order = np.argsort(new_scores)[::-1]
    dets = dets[new_order, :]

    return dets[:top_k, :]
range(K)] + + inter_area = [w[k] * h[k] for k in range(K)] + ious = sum([ + inter_area[k] / (area[k][order[1:]] + area[k][i] - inter_area[k]) + for k in range(K) + ]) + index = np.where(ious > overlapThresh * K)[0] + weight[order[index + 1]] = 1 - ious[index] + + index2 = np.where(ious <= overlapThresh * K)[0] + order = order[index2 + 1] + + dets[:, -1] = dets[:, -1] * weight + + new_scores = dets[:, -1] + new_order = np.argsort(new_scores)[::-1] + dets = dets[new_order, :] + + return dets[:top_k, :] + + +class Dataset(): + + def __init__(self, anno, frm_alldets) -> None: + self.anno = anno + self.video_list = self.anno['test_videos'][0] + self.nframes = self.anno['nframes'] + self.labels = self.anno['labels'] + self.frm_alldets = frm_alldets + + def get_vid_dets(self): + self.vid_frm_det = defaultdict(list) + for frm_det in self.frm_alldets: + vid_idx = int(frm_det[0]) + vid_name = self.video_list[vid_idx] + self.vid_frm_det[vid_name].append(frm_det) + + self.vid_det = dict() + for vid_name, vid_frm_dets in self.vid_frm_det.items(): + self.vid_det[vid_name] = dict() + for frm_idx in range(1, self.nframes[vid_name] + 1): + self.vid_det[vid_name][frm_idx] = dict() + for label_idx in range(len(self.labels)): + self.vid_det[vid_name][frm_idx][label_idx] = np.empty( + shape=(0, 5)) + for frm_dets in vid_frm_dets: + frm_idx = int(frm_dets[1]) + label_idx = int(frm_dets[2]) + det = [*frm_dets[-4:], frm_det[3]] + det = np.array(det)[None, :] + + self.vid_det[vid_name][frm_idx][label_idx] = np.concatenate( + [self.vid_det[vid_name][frm_idx][label_idx], det]) + + return self.vid_det + + +def link_tubes(anno, frm_dets, K=1, len_thre=15): + + dataset = Dataset(anno, frm_dets) + vlist = dataset.video_list + total_VDets = dataset.get_vid_dets() + + total_video_tubes = {label: [] for label in range(len(dataset.labels))} + for v in track(vlist, description='linking tubes...'): + + RES = {} + if v not in total_VDets: + continue + VDets = total_VDets[v] + for ilabel in 
range(len(dataset.labels)): + FINISHED_TUBES = [] + CURRENT_TUBES = [] # tubes is a list of tuple (frame, lstubelets) + + # calculate average scores of tubelets in tubes + + def tubescore(tt): + return np.mean( + np.array([tt[i][1][-1] for i in range(len(tt))])) + + for frame in range(1, dataset.nframes[v] + 2 - K): + # load boxes of the new frame and do nms while keeping Nkeep highest scored # noqa: E501 + ltubelets = np.array( + VDets[frame][ilabel] + ) # [:,range(4*K) + [4*K + 1 + ilabel]] Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score # noqa: E501 + + ltubelets = nms_tubelets(ltubelets, 0.6, top_k=10) + + # just start new tubes + if frame == 1: + for i in range(ltubelets.shape[0]): + CURRENT_TUBES.append([(1, ltubelets[i, :])]) + continue + + # sort current tubes according to average score + avgscore = [tubescore(t) for t in CURRENT_TUBES] + argsort = np.argsort(-np.array(avgscore)) + CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort] + # loop over tubes + finished = [] + for it, t in enumerate(CURRENT_TUBES): + # compute ious between the last box of t and ltubelets + last_frame, last_tubelet = t[-1] + ious = [] + offset = frame - last_frame + if offset < K: + nov = K - offset + ious = sum([ + iou2d_voc( + ltubelets[:, 4 * iov:4 * iov + 4], + last_tubelet[4 * (iov + offset):4 * + (iov + offset + 1)]) + for iov in range(nov) + ]) / float(nov) + else: + ious = iou2d_voc(ltubelets[:, :4], + last_tubelet[4 * K - 4:4 * K]) + + valid = np.where(ious >= 0.5)[0] + + if valid.size > 0: + # take the one with maximum score + idx = valid[np.argmax(ltubelets[valid, -1])] + CURRENT_TUBES[it].append((frame, ltubelets[idx, :])) + ltubelets = np.delete(ltubelets, idx, axis=0) + else: + if offset >= K: + finished.append(it) + + # finished tubes that are done + for it in finished[:: + -1]: # process in reverse order to delete them with the right index why --++-- # noqa: E501 + FINISHED_TUBES.append(CURRENT_TUBES[it][:]) + del CURRENT_TUBES[it] + + # start new tubes + for i in 
range(ltubelets.shape[0]): + CURRENT_TUBES.append([(frame, ltubelets[i, :])]) + + # all tubes are not finished + FINISHED_TUBES += CURRENT_TUBES + + # build real tubes + output = [] + for t in FINISHED_TUBES: + score = tubescore(t) + + # just start new tubes + if score < 0.005: + continue + + beginframe = t[0][0] + endframe = t[-1][0] + K - 1 + length = endframe + 1 - beginframe + + # delete tubes with short duraton + if length < len_thre: + continue + + # build final tubes by average the tubelets + out = np.zeros((length, 6), dtype=np.float32) + out[:, 0] = np.arange(beginframe, endframe + 1) + n_per_frame = np.zeros((length, 1), dtype=np.int32) + for i in range(len(t)): + frame, box = t[i] + for k in range(K): + out[frame - beginframe + k, + 1:5] += box[4 * k:4 * k + 4] + out[frame - beginframe + k, + -1] += box[-1] # single frame confidence + n_per_frame[frame - beginframe + k, 0] += 1 + out[:, 1:] /= n_per_frame + output.append([out, score]) + # out: [num_frames, (frame idx, x1, y1, x2, y2, score)] + + RES[ilabel] = output + if output: + for tube, tube_score in output: + video_tube_res = tuple([v, tube_score, tube]) + total_video_tubes[ilabel].append(video_tube_res) + return total_video_tubes + + +def frameAP(GT, alldets, thr, print_info=True): + logger = MMLogger.get_current_instance() + vlist = GT['test_videos'][0] + + results = {} + for ilabel, label in enumerate(GT['labels']): + # detections of this class + if label in [ + 'aerobic kick jump', 'aerobic off axis jump', + 'aerobic butterfly jump', 'aerobic balance turn', + 'basketball save', 'basketball jump ball' + ]: + if print_info: + logger.info('do not evaluate {}'.format(label)) + continue + # det format: # noqa: E501 + detections = alldets[alldets[:, 2] == ilabel, :] + + # load ground-truth of this class + gt = {} + for iv, v in enumerate(vlist): + tubes = GT['gttubes'][v] + + if ilabel not in tubes: + continue + + for tube in tubes[ilabel]: + for i in range(tube.shape[0]): + k = (iv, int(tube[i, 0])) 
def videoAP(GT, alldets, thr, print_info=True):
    """Compute video-level mean average precision (videoAP) at one
    spatio-temporal IoU threshold.

    Args:
        GT (dict): Ground-truth dict with keys 'test_videos', 'labels'
            and 'gttubes'.
        alldets (dict): ``{label_idx: [(video_name, score, tube), ...]}``
            as produced by ``link_tubes``.
        thr (float): Spatio-temporal IoU threshold.
        print_info (bool): Whether to log per-class results. Default: True.

    Returns:
        float: Video-level mAP (percentage) over evaluated classes.
    """
    logger = MMLogger.get_current_instance()
    vlist = GT['test_videos'][0]

    res = {}
    for ilabel in range(len(GT['labels'])):
        if GT['labels'][ilabel] in [
                'aerobic kick jump', 'aerobic off axis jump',
                'aerobic butterfly jump', 'aerobic balance turn',
                'basketball save', 'basketball jump ball'
        ]:
            if print_info:
                # Message made consistent with frameAP (missing space).
                logger.info('do not evaluate {}'.format(GT['labels'][ilabel]))
            continue
        detections = alldets[ilabel]
        # load ground-truth
        gt = {}
        for v in vlist:
            tubes = GT['gttubes'][v]

            if ilabel not in tubes:
                continue

            gt[v] = tubes[ilabel]

            if len(gt[v]) == 0:
                del gt[v]

        # precision,recall
        pr = np.empty((len(detections), 2), dtype=np.float64)

        gt_num = sum([len(g) for g in gt.values()])  # false negatives
        fp = 0  # false positives
        tp = 0  # true positives
        if gt_num == 0:
            if print_info:
                # BUGFIX: the original `logger.info('no such label',
                # ilabel, ...)` passed positional args to a message with
                # no format placeholders, triggering a logging error.
                logger.info('no such label {} {}'.format(
                    ilabel, GT['labels'][ilabel]))
            continue
        is_gt_box_detected = {}
        for i, j in enumerate(
                np.argsort(-np.array([dd[1] for dd in detections]))):
            v, score, tube = detections[j]
            ispositive = False
            if v in gt:
                if v not in is_gt_box_detected:
                    is_gt_box_detected[v] = np.zeros(len(gt[v]), dtype=bool)
                ious = [iou3dt_voc(g, tube) for g in gt[v]]
                amax = np.argmax(ious)
                if ious[amax] >= thr:
                    if not is_gt_box_detected[v][amax]:
                        ispositive = True
                        is_gt_box_detected[v][amax] = True

            if ispositive:
                tp += 1
            else:
                fp += 1

            pr[i, 0] = float(tp) / float(tp + fp)
            pr[i, 1] = float(tp) / float(gt_num)
        res[GT['labels'][ilabel]] = pr

    # display results
    ap = 100 * np.array([pr_to_ap_voc(res[label]) for label in res])
    videoap_result = np.mean(ap)
    class_result = {}
    for label in res:
        class_result[label] = pr_to_ap_voc(res[label]) * 100
    if print_info:
        logger.info('VideoAP_{}\n'.format(thr))
        for label in class_result:
            logger.info('{:20s} {:8.2f}'.format(label, class_result[label]))
        logger.info('{:20s} {:8.2f}'.format('mAP', videoap_result))
    return videoap_result


def videoAP_all(groundtruth, detections):
    """Compute videoAP averaged over three spatio-temporal IoU ranges.

    Returns:
        dict: Mean videoAP over the ranges 0.05:0.45, 0.10:0.90 and
            0.50:0.95 (keys 'v_map_<lo>:<hi>').
    """
    high_ap = 0
    for i in range(10):
        thr = 0.5 + 0.05 * i
        high_ap += videoAP(groundtruth, detections, thr, print_info=False)
    high_ap = high_ap / 10.0

    low_ap = 0
    for i in range(9):
        thr = 0.05 + 0.05 * i
        low_ap += videoAP(groundtruth, detections, thr, print_info=False)
    low_ap = low_ap / 9.0

    all_ap = 0
    for i in range(9):
        thr = 0.1 + 0.1 * i
        all_ap += videoAP(groundtruth, detections, thr, print_info=False)
    all_ap = all_ap / 9.0

    map = {
        'v_map_0.05:0.45': round(low_ap, 4),
        'v_map_0.10:0.90': round(all_ap, 4),
        'v_map_0.50:0.95': round(high_ap, 4),
    }
    return map


def videoAP_error(GT, alldets, thr):
    """Decompose video-level detection errors at threshold ``thr``.

    Attributes each false positive to one error type — repeat, extra,
    localization, classification, timing or their combinations — and
    prints a per-class table of the corresponding AP-style scores.

    Args:
        GT (dict): Ground-truth dict with keys 'test_videos', 'labels'
            and 'gttubes'.
        alldets (dict): ``{label_idx: [(video_name, score, tube), ...]}``.
        thr (float): Spatio-temporal IoU threshold.
    """
    vlist = GT['test_videos'][0]

    # Spatial/temporal sub-thresholds used to attribute a miss to
    # localization vs timing (sqrt so that th_s * th_t == thr).
    th_s = math.sqrt(thr)
    th_t = math.sqrt(thr)

    print('th is', thr)
    print('th_s is', th_s)
    print('th_t is', th_t)

    res = {}
    dupgt = {}
    for v in vlist:
        dupgt[v] = GT['gttubes'][v]
    # compute video error for every class
    for ilabel in range(len(GT['labels'])):
        if GT['labels'][ilabel] in [
                'aerobic kick jump', 'aerobic off axis jump',
                'aerobic butterfly jump', 'aerobic balance turn',
                'basketball save', 'basketball jump ball'
        ]:
            print('do not evaluate {}'.format(GT['labels'][ilabel]))
            continue
        detections = alldets[ilabel]

        # Columns: precision, recall, then one rate per error type.
        pr = np.zeros((len(detections), 11), dtype=np.float32)

        gt_num = 0
        for v in dupgt:
            if ilabel in dupgt[v]:
                gt_num = gt_num + len(dupgt[v][ilabel])
        fp = 0  # false positives
        tp = 0  # true positives
        ER = 0  # repeat error repeat predict for the same instance
        EN = 0  # extra error
        EL = 0  # localization errors
        EC = 0  # classification error
        ET = 0  # timing error
        ErrCT = 0  # cls + time
        ECL = 0  # cls + loc
        ETL = 0  # time + loc
        ECTL = 0  # cls + time + loc

        is_gt_box_detected = {}
        # Iterate detections by decreasing score.
        for i, j in enumerate(
                np.argsort(-np.array([dd[1] for dd in detections]))):
            v, score, tube = detections[j]
            ispositive = False
            end = False
            # 1) Match against same-class ground truth.
            if ilabel in dupgt[v]:
                if v not in is_gt_box_detected:
                    is_gt_box_detected[v] = np.zeros(
                        len(dupgt[v][ilabel]), dtype=bool)
                ious = [iou3dt_voc(g, tube) for g in dupgt[v][ilabel]]
                amax = np.argmax(ious)
                if ious[amax] >= thr:
                    if not is_gt_box_detected[v][amax]:
                        ispositive = True
                        is_gt_box_detected[v][amax] = True
                    else:
                        ER += 1
                    end = True
            # 2) Otherwise check overlap with other-class ground truth
            #    (classification error).
            if end is False:
                ious = []
                for ll in dupgt[v]:
                    if ll == ilabel:
                        continue
                    for g in dupgt[v][ll]:
                        ious.append(iou3dt_voc(g, tube))
                if ious != []:
                    amax = np.argmax(ious)
                    if ious[amax] >= thr:
                        EC += 1
                        end = True
            # 3) Otherwise attribute to localization/timing (or their
            #    combinations), or "extra" if no overlap at all.
            if end is False:
                all_gt = []
                ious = []
                for ll in dupgt[v]:
                    for g in dupgt[v][ll]:
                        all_gt.append((ll, g))
                        ious.append(iou3dt_voc(g, tube))
                amax = np.argmax(ious)
                assert (ious[amax] < thr)
                if ious[amax] > 0:
                    t_iou = iou3dt_voc(
                        all_gt[amax][1], tube, temporalonly=True)
                    s_iou = iou3dt_voc(all_gt[amax][1], tube, spatialonly=True)
                    if all_gt[amax][0] == ilabel:
                        assert (t_iou < th_t or s_iou < th_s)
                        if t_iou >= th_t:
                            EL += 1
                            end = True
                        elif s_iou >= th_s:
                            ET += 1
                            end = True
                        else:
                            ETL += 1
                            end = True
                    else:
                        assert (t_iou < th_t or s_iou < th_s)
                        if t_iou >= th_t:
                            ECL += 1
                            end = True
                        elif s_iou >= th_s:
                            ErrCT += 1
                            end = True
                        else:
                            ECTL += 1
                            end = True
                else:
                    EN += 1
                    end = True
            assert (end is True)
            if ispositive:
                tp += 1
                # fn -= 1
            else:
                fp += 1
            # Every false positive must be attributed to exactly one
            # error type.
            assert (fp == (ER + EN + EL + EC + ET + ErrCT + ECL + ETL + ECTL))
            pr[i, 0] = max(float(tp) / float(tp + fp), 0.)
            pr[i, 1] = max(float(tp) / float(gt_num), 0.)
            pr[i, 2] = max(float(ER) / float(tp + fp), 0.)
            pr[i, 3] = max(float(EN) / float(tp + fp), 0.)
            pr[i, 4] = max(float(EL) / float(tp + fp), 0.)
            pr[i, 5] = max(float(EC) / float(tp + fp), 0.)
            pr[i, 6] = max(float(ET) / float(tp + fp), 0.)
            pr[i, 7] = max(float(ErrCT) / float(tp + fp), 0.)
            pr[i, 8] = max(float(ECL) / float(tp + fp), 0.)
            pr[i, 9] = max(float(ETL) / float(tp + fp), 0.)
            pr[i, 10] = max(float(ECTL) / float(tp + fp), 0.)

        res[GT['labels'][ilabel]] = pr

    # display results
    AP = 100 * np.array([pr_to_ap_voc(res[label][:, [0, 1]]) for label in res])
    othersap = [
        100 * np.array([pr_to_ap_voc(res[label][:, [j, 1]]) for label in res])
        for j in range(2, 11)
    ]

    ER = othersap[0]
    EN = othersap[1]
    EL = othersap[2]
    EC = othersap[3]
    ET = othersap[4]
    ErrCT = othersap[5]
    ECL = othersap[6]
    ETL = othersap[7]
    ECTL = othersap[8]
    # missed detections = 1 - recall
    EM = []
    for label in res:
        if res[label].shape[0] != 0:
            EM.append(100 - 100 * res[label][-1, 1])
        else:
            EM.append(100)
    EM = np.array(EM)

    LIST = [AP, ER, EN, EL, EC, ET, ErrCT, ECL, ETL, ECTL, EM]

    print('Error Analysis')

    print('')
    print(
        '{:20s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}'  # noqa: E501
        .format('label', ' AP ', ' Repeat ', ' Extra ', ' Loc. ', ' Cls. ',
                ' Time ', ' Cls.+Time ', ' Cls.+Loc. ', ' Time+Loc. ',
                ' C+T+L ', ' missed '))
    print('')
    for il, label in enumerate(res):
        print('{:20s} '.format(label) +
              ' '.join(['{:8.2f}'.format(L[il]) for L in LIST]))
    print('')
    print('{:20s} '.format('mean') +
          ' '.join(['{:8.2f}'.format(np.mean(L)) for L in LIST]))
    print('')


# Copyright (c) OpenMMLab. All rights reserved.
+from .acc_metric import AccMetric, ConfusionMatrix +from .anet_metric import ANetMetric +from .ava_metric import AVAMetric +from .multimodal_metric import VQAMCACC, ReportVQA, RetrievalRecall, VQAAcc +from .multisports_metric import MultiSportsMetric +from .retrieval_metric import RetrievalMetric +from .video_grounding_metric import RecallatTopK + +__all__ = [ + 'AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix', + 'MultiSportsMetric', 'RetrievalMetric', 'VQAAcc', 'ReportVQA', 'VQAMCACC', + 'RetrievalRecall', 'RecallatTopK' +] diff --git a/mmaction/evaluation/metrics/__pycache__/__init__.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02063a43e0916f85480aa53703877d21c6adaae8 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b1852927902c8a57a6b7644dd0db24d7b7378b5 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/acc_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..980195e54fcf5f2e70345508508a3c6d2a0509c3 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/anet_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..468b31bb53aade68ef499f27489cbf0613fa22eb Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/ava_metric.cpython-312.pyc differ diff --git 
a/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77b32c803c3a909e654177e4774f1e9e85ad1dc5 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/multimodal_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca8542e0c1d4d6a76c67b923e2c63830362c56ea Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/multisports_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c1db097095e4d5bdab3ec98f18182331cff1e82 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/retrieval_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-312.pyc b/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30deaa1a9f0d12e523e8e09628e55a439ff71168 Binary files /dev/null and b/mmaction/evaluation/metrics/__pycache__/video_grounding_metric.cpython-312.pyc differ diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..91020fbcccd0e7a4824a29b296eb9129f5d10ec8 --- /dev/null +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -0,0 +1,387 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from collections import OrderedDict +from itertools import product +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +import torch +from mmengine.evaluator import BaseMetric + +from mmaction.evaluation import (get_weighted_score, mean_average_precision, + mean_class_accuracy, + mmit_mean_average_precision, top_k_accuracy) +from mmaction.registry import METRICS + + +def to_tensor(value): + """Convert value to torch.Tensor.""" + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.is_str(value): + value = torch.tensor(value) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'{type(value)} is not an available argument.') + return value + + +@METRICS.register_module() +class AccMetric(BaseMetric): + """Accuracy evaluation metric.""" + default_prefix: Optional[str] = 'acc' + + def __init__(self, + metric_list: Optional[Union[str, Tuple[str]]] = ( + 'top_k_accuracy', 'mean_class_accuracy'), + collect_device: str = 'cpu', + metric_options: Optional[Dict] = dict( + top_k_accuracy=dict(topk=(1, 5))), + prefix: Optional[str] = None) -> None: + + # TODO: fix the metric_list argument with a better one. + # `metrics` is not a safe argument here with mmengine. + # we have to replace it with `metric_list`. 
+ super().__init__(collect_device=collect_device, prefix=prefix) + if not isinstance(metric_list, (str, tuple)): + raise TypeError('metric_list must be str or tuple of str, ' + f'but got {type(metric_list)}') + + if isinstance(metric_list, str): + metrics = (metric_list, ) + else: + metrics = metric_list + + # coco evaluation metrics + for metric in metrics: + assert metric in [ + 'top_k_accuracy', 'mean_class_accuracy', + 'mmit_mean_average_precision', 'mean_average_precision' + ] + + self.metrics = metrics + self.metric_options = metric_options + + def process(self, data_batch: Sequence[Tuple[Any, Dict]], + data_samples: Sequence[Dict]) -> None: + """Process one batch of data samples and data_samples. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_score'] + label = data_sample['gt_label'] + + # Ad-hoc for RGBPoseConv3D + if isinstance(pred, dict): + for item_name, score in pred.items(): + pred[item_name] = score.cpu().numpy() + else: + pred = pred.cpu().numpy() + + result['pred'] = pred + if label.size(0) == 1: + # single-label + result['label'] = label.item() + else: + # multi-label + result['label'] = label.cpu().numpy() + self.results.append(result) + + def compute_metrics(self, results: List) -> Dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + labels = [x['label'] for x in results] + + eval_results = dict() + # Ad-hoc for RGBPoseConv3D + if isinstance(results[0]['pred'], dict): + + for item_name in results[0]['pred'].keys(): + preds = [x['pred'][item_name] for x in results] + eval_result = self.calculate(preds, labels) + eval_results.update( + {f'{item_name}_{k}': v + for k, v in eval_result.items()}) + + if len(results[0]['pred']) == 2 and \ + 'rgb' in results[0]['pred'] and \ + 'pose' in results[0]['pred']: + + rgb = [x['pred']['rgb'] for x in results] + pose = [x['pred']['pose'] for x in results] + + preds = { + '1:1': get_weighted_score([rgb, pose], [1, 1]), + '2:1': get_weighted_score([rgb, pose], [2, 1]), + '1:2': get_weighted_score([rgb, pose], [1, 2]) + } + for k in preds: + eval_result = self.calculate(preds[k], labels) + eval_results.update({ + f'RGBPose_{k}_{key}': v + for key, v in eval_result.items() + }) + return eval_results + + # Simple Acc Calculation + else: + preds = [x['pred'] for x in results] + return self.calculate(preds, labels) + + def calculate(self, preds: List[np.ndarray], + labels: List[Union[int, np.ndarray]]) -> Dict: + """Compute the metrics from processed results. + + Args: + preds (list[np.ndarray]): List of the prediction scores. + labels (list[int | np.ndarray]): List of the labels. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + eval_results = OrderedDict() + metric_options = copy.deepcopy(self.metric_options) + for metric in self.metrics: + if metric == 'top_k_accuracy': + topk = metric_options.setdefault('top_k_accuracy', + {}).setdefault( + 'topk', (1, 5)) + + if not isinstance(topk, (int, tuple)): + raise TypeError('topk must be int or tuple of int, ' + f'but got {type(topk)}') + + if isinstance(topk, int): + topk = (topk, ) + + top_k_acc = top_k_accuracy(preds, labels, topk) + for k, acc in zip(topk, top_k_acc): + eval_results[f'top{k}'] = acc + + if metric == 'mean_class_accuracy': + mean1 = mean_class_accuracy(preds, labels) + eval_results['mean1'] = mean1 + + if metric in [ + 'mean_average_precision', + 'mmit_mean_average_precision', + ]: + if metric == 'mean_average_precision': + mAP = mean_average_precision(preds, labels) + eval_results['mean_average_precision'] = mAP + + elif metric == 'mmit_mean_average_precision': + mAP = mmit_mean_average_precision(preds, labels) + eval_results['mmit_mean_average_precision'] = mAP + + return eval_results + + +@METRICS.register_module() +class ConfusionMatrix(BaseMetric): + r"""A metric to calculate confusion matrix for single-label tasks. + + Args: + num_classes (int, optional): The number of classes. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + + Examples: + + 1. The basic usage. 
+ + >>> import torch + >>> from mmaction.evaluation import ConfusionMatrix + >>> y_pred = [0, 1, 1, 3] + >>> y_true = [0, 2, 1, 3] + >>> ConfusionMatrix.calculate(y_pred, y_true, num_classes=4) + tensor([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1]]) + >>> # plot the confusion matrix + >>> import matplotlib.pyplot as plt + >>> y_score = torch.rand((1000, 10)) + >>> y_true = torch.randint(10, (1000, )) + >>> matrix = ConfusionMatrix.calculate(y_score, y_true) + >>> ConfusionMatrix().plot(matrix) + >>> plt.show() + + 2. In the config file + + .. code:: python + + val_evaluator = dict(type='ConfusionMatrix') + test_evaluator = dict(type='ConfusionMatrix') + """ # noqa: E501 + default_prefix = 'confusion_matrix' + + def __init__(self, + num_classes: Optional[int] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + self.num_classes = num_classes + + def process(self, data_batch, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_scores = data_sample.get('pred_score') + gt_label = data_sample['gt_label'] + if pred_scores is not None: + pred_label = pred_scores.argmax(dim=0, keepdim=True) + self.num_classes = pred_scores.size(0) + else: + pred_label = data_sample['pred_label'] + + self.results.append({ + 'pred_label': pred_label, + 'gt_label': gt_label + }) + + def compute_metrics(self, results: list) -> dict: + pred_labels = [] + gt_labels = [] + for result in results: + pred_labels.append(result['pred_label']) + gt_labels.append(result['gt_label']) + confusion_matrix = ConfusionMatrix.calculate( + torch.cat(pred_labels), + torch.cat(gt_labels), + num_classes=self.num_classes) + return {'result': confusion_matrix} + + @staticmethod + def calculate(pred, target, num_classes=None) -> dict: + """Calculate the confusion matrix for single-label task. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. 
It can be labels (N, ), or scores of every + class (N, C). + target (torch.Tensor | np.ndarray | Sequence): The target of + each prediction with shape (N, ). + num_classes (Optional, int): The number of classes. If the ``pred`` + is label instead of scores, this argument is required. + Defaults to None. + + Returns: + torch.Tensor: The confusion matrix. + """ + pred = to_tensor(pred) + target_label = to_tensor(target).int() + + assert pred.size(0) == target_label.size(0), \ + f"The size of pred ({pred.size(0)}) doesn't match "\ + f'the target ({target_label.size(0)}).' + assert target_label.ndim == 1 + + if pred.ndim == 1: + assert num_classes is not None, \ + 'Please specify the `num_classes` if the `pred` is labels ' \ + 'intead of scores.' + pred_label = pred + else: + num_classes = num_classes or pred.size(1) + pred_label = torch.argmax(pred, dim=1).flatten() + + with torch.no_grad(): + indices = num_classes * target_label + pred_label + matrix = torch.bincount(indices, minlength=num_classes**2) + matrix = matrix.reshape(num_classes, num_classes) + + return matrix + + @staticmethod + def plot(confusion_matrix: torch.Tensor, + include_values: bool = False, + cmap: str = 'viridis', + classes: Optional[List[str]] = None, + colorbar: bool = True, + show: bool = True): + """Draw a confusion matrix by matplotlib. + + Modified from `Scikit-Learn + `_ + + Args: + confusion_matrix (torch.Tensor): The confusion matrix to draw. + include_values (bool): Whether to draw the values in the figure. + Defaults to False. + cmap (str): The color map to use. Defaults to use "viridis". + classes (list[str], optional): The names of categories. + Defaults to None, which means to use index number. + colorbar (bool): Whether to show the colorbar. Defaults to True. + show (bool): Whether to show the figure immediately. + Defaults to True. 
+ """ # noqa: E501 + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(10, 10)) + + num_classes = confusion_matrix.size(0) + + im_ = ax.imshow(confusion_matrix, interpolation='nearest', cmap=cmap) + text_ = None + cmap_min, cmap_max = im_.cmap(0), im_.cmap(1.0) + + if include_values: + text_ = np.empty_like(confusion_matrix, dtype=object) + + # print text with appropriate color depending on background + thresh = (confusion_matrix.max() + confusion_matrix.min()) / 2.0 + + for i, j in product(range(num_classes), range(num_classes)): + color = cmap_max if confusion_matrix[i, + j] < thresh else cmap_min + + text_cm = format(confusion_matrix[i, j], '.2g') + text_d = format(confusion_matrix[i, j], 'd') + if len(text_d) < len(text_cm): + text_cm = text_d + + text_[i, j] = ax.text( + j, i, text_cm, ha='center', va='center', color=color) + + display_labels = classes or np.arange(num_classes) + + if colorbar: + fig.colorbar(im_, ax=ax) + ax.set( + xticks=np.arange(num_classes), + yticks=np.arange(num_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel='True label', + xlabel='Predicted label', + ) + ax.invert_yaxis() + ax.xaxis.tick_top() + + ax.set_ylim((num_classes - 0.5, -0.5)) + # Automatically rotate the x labels. + fig.autofmt_xdate(ha='center') + + if show: + plt.show() + return fig diff --git a/mmaction/evaluation/metrics/anet_metric.py b/mmaction/evaluation/metrics/anet_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..c569eca15584e8cdd001b61abf86fbdda58183e5 --- /dev/null +++ b/mmaction/evaluation/metrics/anet_metric.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import os
import os.path as osp
from collections import OrderedDict
from typing import Any, Optional, Sequence, Tuple

import mmcv
import mmengine
import numpy as np
from mmengine.evaluator import BaseMetric

from mmaction.evaluation import average_recall_at_avg_proposals
from mmaction.registry import METRICS
from mmaction.utils import ConfigType


@METRICS.register_module()
class ANetMetric(BaseMetric):
    """ActivityNet dataset evaluation metric.

    Args:
        metric_type (str): The type of metric: ``'TEM'`` only dumps middle
            results; ``'AR@AN'`` additionally computes average recall at
            average number of proposals. Defaults to ``'TEM'``.
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            Defaults to None.
        metric_options (dict, optional): Options for the 'AR@AN' metric.
            Defaults to None, treated as an empty dict.
        dump_config (dict, optional): Dump configuration. Must contain the
            key ``'out'`` (output path); the optional key
            ``'output_format'`` chooses 'csv' (default) or 'json'.
            Defaults to None, treated as ``dict(out='')``.
    """

    def __init__(self,
                 metric_type: str = 'TEM',
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None,
                 metric_options: Optional[dict] = None,
                 dump_config: Optional[ConfigType] = None):
        super().__init__(collect_device=collect_device, prefix=prefix)
        self.metric_type = metric_type

        # Fix: the defaults used to be shared mutable dicts
        # (``metric_options={}``, ``dump_config=dict(out='')``) and
        # ``dump_config.pop`` mutated the caller's dict / the shared default.
        # Use None sentinels and non-mutating access instead.
        if dump_config is None:
            dump_config = dict(out='')
        assert 'out' in dump_config
        self.output_format = dump_config.get('output_format', 'csv')
        self.out = dump_config['out']

        self.metric_options = {} if metric_options is None else metric_options
        if self.metric_type == 'AR@AN':
            self.ground_truth = {}

    def process(self, data_batch: Sequence[Tuple[Any, dict]],
                predictions: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions. The processed
        results should be stored in ``self.results``, which will be used to
        compute the metrics when all batches have been processed.

        Args:
            data_batch (Sequence[Tuple[Any, dict]]): A batch of data
                from the dataloader.
            predictions (Sequence[dict]): A batch of outputs from
                the model.
        """
        for pred in predictions:
            self.results.append(pred)

        if self.metric_type == 'AR@AN':
            data_batch = data_batch['data_samples']
            for data_sample in data_batch:
                video_info = data_sample.metainfo
                # Video names carry a 'v_' prefix; strip it for the GT key.
                video_id = video_info['video_name'][2:]
                this_video_gt = []
                for ann in video_info['annotations']:
                    t_start, t_end = ann['segment']
                    label = ann['label']
                    this_video_gt.append([t_start, t_end, label])
                self.ground_truth[video_id] = np.array(this_video_gt)

    def compute_metrics(self, results: list) -> dict:
        """Compute the metrics from processed results.

        If `metric_type` is 'TEM', only dump middle results and do not compute
        any metrics.
        Args:
            results (list): The processed results of each batch.
        Returns:
            dict: The computed metrics. The keys are the names of the metrics,
            and the values are corresponding results.
        """
        self.dump_results(results)
        if self.metric_type == 'AR@AN':
            return self.compute_ARAN(results)
        return OrderedDict()

    def compute_ARAN(self, results: list) -> dict:
        """AR@AN evaluation metric."""
        temporal_iou_thresholds = self.metric_options.setdefault(
            'AR@AN', {}).setdefault('temporal_iou_thresholds',
                                    np.linspace(0.5, 0.95, 10))
        max_avg_proposals = self.metric_options.setdefault(
            'AR@AN', {}).setdefault('max_avg_proposals', 100)
        if isinstance(temporal_iou_thresholds, list):
            temporal_iou_thresholds = np.array(temporal_iou_thresholds)

        eval_results = OrderedDict()
        proposal, num_proposals = self._import_proposals(results)

        recall, _, _, auc = average_recall_at_avg_proposals(
            self.ground_truth,
            proposal,
            num_proposals,
            max_avg_proposals=max_avg_proposals,
            temporal_iou_thresholds=temporal_iou_thresholds)
        eval_results['auc'] = auc
        # recall is (num_thresholds, max_avg_proposals); column k-1 is AR@k.
        eval_results['AR@1'] = np.mean(recall[:, 0])
        eval_results['AR@5'] = np.mean(recall[:, 4])
        eval_results['AR@10'] = np.mean(recall[:, 9])
        eval_results['AR@100'] = np.mean(recall[:, 99])

        return eval_results

    def dump_results(self, results, version='VERSION 1.3'):
        """Save middle or final results to disk.

        Args:
            results (list): The results to dump. In 'csv' mode each item is
                expected to be a ``(video_name, outputs)`` pair.
            version (str): Version string stored in the json output.
        """
        if self.output_format == 'json':
            result_dict = self.proposals2json(results)
            output_dict = {
                'version': version,
                'results': result_dict,
                'external_data': {}
            }
            mmengine.dump(output_dict, self.out)
        elif self.output_format == 'csv':
            os.makedirs(self.out, exist_ok=True)
            header = 'action,start,end,tmin,tmax'
            for result in results:
                video_name, outputs = result
                output_path = osp.join(self.out, video_name + '.csv')
                np.savetxt(
                    output_path,
                    outputs,
                    header=header,
                    delimiter=',',
                    comments='')
        else:
            raise ValueError(
                f'The output format {self.output_format} is not supported.')

    @staticmethod
    def proposals2json(results, show_progress=False):
        """Convert all proposals to a final dict(json) format.

        Args:
            results (list[dict]): All proposals.
            show_progress (bool): Whether to show the progress bar.
                Defaults: False.
        Returns:
            dict: The final result dict. E.g.
            .. code-block:: Python
                dict(video-1=[dict(segment=[1.1,2.0]. score=0.9),
                              dict(segment=[50.1, 129.3], score=0.6)])
        """
        result_dict = {}
        print('Convert proposals to json format')
        if show_progress:
            prog_bar = mmcv.ProgressBar(len(results))
        for result in results:
            video_name = result['video_name']
            result_dict[video_name[2:]] = result['proposal_list']
            if show_progress:
                prog_bar.update()
        return result_dict

    @staticmethod
    def _import_proposals(results):
        """Read predictions from results.

        Returns:
            tuple: (proposals dict keyed by video id, total proposal count).
        """
        proposals = {}
        num_proposals = 0
        for result in results:
            video_id = result['video_name'][2:]
            this_video_proposals = []
            for proposal in result['proposal_list']:
                t_start, t_end = proposal['segment']
                score = proposal['score']
                this_video_proposals.append([t_start, t_end, score])
                num_proposals += 1
            proposals[video_id] = np.array(this_video_proposals)
        return proposals, num_proposals
import os
from datetime import datetime
from typing import Any, List, Optional, Sequence, Tuple

from mmengine.evaluator import BaseMetric

from mmaction.evaluation import ava_eval, results2csv
from mmaction.registry import METRICS
from mmaction.structures import bbox2result


@METRICS.register_module()
class AVAMetric(BaseMetric):
    """AVA evaluation metric.

    Converts per-frame detection outputs to the AVA csv format and scores
    them with the official evaluation code.
    """
    default_prefix: Optional[str] = 'mAP'

    def __init__(self,
                 ann_file: str,
                 exclude_file: str,
                 label_file: str,
                 options: Tuple[str] = ('mAP', ),
                 action_thr: float = 0.002,
                 num_classes: int = 81,
                 custom_classes: Optional[List[int]] = None,
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None):
        super().__init__(collect_device=collect_device, prefix=prefix)
        # Only a single evaluation option is supported at a time.
        assert len(options) == 1
        self.ann_file = ann_file
        self.exclude_file = exclude_file
        self.label_file = label_file
        self.num_classes = num_classes
        self.options = options
        self.action_thr = action_thr
        self.custom_classes = custom_classes
        if custom_classes is not None:
            # Class id 0 (background) is always kept in front.
            self.custom_classes = list([0] + custom_classes)

    def process(self, data_batch: Sequence[Tuple[Any, dict]],
                data_samples: Sequence[dict]) -> None:
        """Store one batch of per-frame detections for later evaluation.

        Args:
            data_batch (Sequence[Tuple[Any, dict]]): A batch of data
                from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from
                the model.
        """
        for sample in data_samples:
            instances = sample['pred_instances']
            packed = dict(
                video_id=sample['video_id'],
                timestamp=sample['timestamp'],
                outputs=bbox2result(
                    instances['bboxes'],
                    instances['scores'],
                    num_classes=self.num_classes,
                    thr=self.action_thr))
            self.results.append(packed)

    def compute_metrics(self, results: list) -> dict:
        """Run the official AVA evaluation on the collected results.

        Args:
            results (list): The processed results of each batch.
        Returns:
            dict: The computed metrics. The keys are the names of the metrics,
            and the values are corresponding results.
        """
        # Dump to a uniquely-named temporary csv, score it, then clean up.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        temp_file = f'AVA_{stamp}_result.csv'
        results2csv(results, temp_file, self.custom_classes)

        eval_results = ava_eval(
            temp_file,
            self.options[0],
            self.label_file,
            self.ann_file,
            self.exclude_file,
            ignore_empty_frames=True,
            custom_classes=self.custom_classes)

        os.remove(temp_file)

        return eval_results
+# Copied from mmpretrain +# Partly adopted from https://github.com/GT-Vision-Lab/VQA +# Copyright (c) 2014, Aishwarya Agrawal +from typing import List, Optional, Sequence, Union + +import mmengine +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger +from mmengine.utils import is_seq_of + +from mmaction.registry import METRICS +from mmaction.structures.action_data_sample import format_label +from .acc_metric import to_tensor + + +def _process_punctuation(inText): + import re + outText = inText + punct = [ + ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', + '>', '<', '@', '`', ',', '?', '!' + ] + commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 + periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 + for p in punct: + if (p + ' ' in inText or ' ' + p in inText) or (re.search( + commaStrip, inText) is not None): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = periodStrip.sub('', outText, re.UNICODE) + return outText + + +def _process_digit_article(inText): + outText = [] + tempText = inText.lower().split() + articles = ['a', 'an', 'the'] + manualMap = { + 'none': '0', + 'zero': '0', + 'one': '1', + 'two': '2', + 'three': '3', + 'four': '4', + 'five': '5', + 'six': '6', + 'seven': '7', + 'eight': '8', + 'nine': '9', + 'ten': '10', + } + contractions = { + 'aint': "ain't", + 'arent': "aren't", + 'cant': "can't", + 'couldve': "could've", + 'couldnt': "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + 'didnt': "didn't", + 'doesnt': "doesn't", + 'dont': "don't", + 'hadnt': "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + 'hasnt': "hasn't", + 'havent': "haven't", + 'hed': "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + 'hes': "he's", + 'howd': "how'd", + 'howll': "how'll", + 'hows': "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + 'Im': "I'm", + 
'Ive': "I've", + 'isnt': "isn't", + 'itd': "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + 'itll': "it'll", + "let's": "let's", + 'maam': "ma'am", + 'mightnt': "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + 'mightve': "might've", + 'mustnt': "mustn't", + 'mustve': "must've", + 'neednt': "needn't", + 'notve': "not've", + 'oclock': "o'clock", + 'oughtnt': "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + 'shant': "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + 'shouldve': "should've", + 'shouldnt': "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": 'somebodyd', + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + 'somebodyll': "somebody'll", + 'somebodys': "somebody's", + 'someoned': "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + 'someonell': "someone'll", + 'someones': "someone's", + 'somethingd': "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + 'somethingll': "something'll", + 'thats': "that's", + 'thered': "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + 'therere': "there're", + 'theres': "there's", + 'theyd': "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + 'theyll': "they'll", + 'theyre': "they're", + 'theyve': "they've", + 'twas': "'twas", + 'wasnt': "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + 'weve': "we've", + 'werent': "weren't", + 'whatll': "what'll", + 'whatre': "what're", + 'whats': "what's", + 'whatve': "what've", + 'whens': "when's", + 'whered': "where'd", + 'wheres': "where's", + 'whereve': "where've", + 'whod': "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + 'wholl': "who'll", + 'whos': "who's", + 'whove': "who've", + 'whyll': "why'll", + 'whyre': "why're", + 'whys': "why's", + 'wont': "won't", + 'wouldve': "would've", + 'wouldnt': 
"wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + 'yall': "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + 'youd': "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + 'youll': "you'll", + 'youre': "you're", + 'youve': "you've", + } + for word in tempText: + word = manualMap.setdefault(word, word) + if word not in articles: + outText.append(word) + for wordId, word in enumerate(outText): + if word in contractions: + outText[wordId] = contractions[word] + outText = ' '.join(outText) + return outText + + +@METRICS.register_module() +class VQAAcc(BaseMetric): + '''VQA Acc metric. + Args: + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + ''' + default_prefix = 'VQA' + + def __init__(self, + full_score_weight: float = 0.3, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + self.full_score_weight = full_score_weight + + def process(self, data_batch, data_samples): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. 
+ """ + for sample in data_samples: + gt_answer = sample.get('gt_answer') + gt_answer_weight = sample.get('gt_answer_weight') + if isinstance(gt_answer, str): + gt_answer = [gt_answer] + if gt_answer_weight is None: + gt_answer_weight = [1. / (len(gt_answer))] * len(gt_answer) + + result = { + 'pred_answer': sample.get('pred_answer'), + 'gt_answer': gt_answer, + 'gt_answer_weight': gt_answer_weight, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + acc = [] + for result in results: + pred_answer = self._process_answer(result['pred_answer']) + gt_answer = [ + self._process_answer(answer) for answer in result['gt_answer'] + ] + answer_weight = result['gt_answer_weight'] + + weight_sum = 0 + for i, gt in enumerate(gt_answer): + if gt == pred_answer: + weight_sum += answer_weight[i] + vqa_acc = min(1.0, weight_sum / self.full_score_weight) + acc.append(vqa_acc) + + accuracy = sum(acc) / len(acc) * 100 + + metrics = {'acc': accuracy} + return metrics + + def _process_answer(self, answer): + answer = answer.replace('\n', ' ') + answer = answer.replace('\t', ' ') + answer = answer.strip() + answer = _process_punctuation(answer) + answer = _process_digit_article(answer) + return answer + + +@METRICS.register_module() +class ReportVQA(BaseMetric): + """Dump VQA result to the standard json format for VQA evaluation. + + Args: + file_path (str): The file path to save the result file. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. 
+ If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + """ + default_prefix = 'VQA' + + def __init__(self, + file_path: str, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + if not file_path.endswith('.json'): + raise ValueError('The output file must be a json file.') + self.file_path = file_path + + def process(self, data_batch, data_samples) -> None: + """transfer tensors in predictions to CPU.""" + for sample in data_samples: + question_id = sample['question_id'] + pred_answer = sample['pred_answer'] + + result = { + 'question_id': int(question_id), + 'answer': pred_answer, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Dump the result to json file.""" + mmengine.dump(results, self.file_path) + logger = MMLogger.get_current_instance() + logger.info(f'Results has been saved to {self.file_path}.') + return {} + + +@METRICS.register_module() +class VQAMCACC(BaseMetric): + '''VQA multiple choice Acc metric. + Args: + + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + ''' + default_prefix = 'VQAMC' + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super().__init__(collect_device=collect_device, prefix=prefix) + + def process(self, data_batch, data_samples): + """Process one batch of data samples. 
+ + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for sample in data_samples: + # gt_labels in datasample is a LabelData + label = sample['gt_label'].item() + result = { + 'pred_label': sample.get('pred_label'), + 'gt_label': label, + } + + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + preds = np.array([x['pred_label'] for x in results]) + labels = np.array([x['gt_label'] for x in results]) + + accuracy = np.sum(preds == labels) / len(preds) * 100 + + metrics = {'acc': accuracy} + return metrics + + +@METRICS.register_module() +class RetrievalRecall(BaseMetric): + r"""Recall evaluation metric for image retrieval. + + Args: + topk (int | Sequence[int]): If the ground truth label matches one of + the best **k** predictions, the sample will be regard as a positive + prediction. If the parameter is a tuple, all of top-k recall will + be calculated and outputted together. Defaults to 1. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. 
+ + """ + default_prefix: Optional[str] = 'retrieval' + + def __init__(self, + topk: Union[int, Sequence[int]], + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + topk = (topk, ) if isinstance(topk, int) else topk + + for k in topk: + if k <= 0: + raise ValueError('`topk` must be a ingter larger than 0 ' + 'or seq of ingter larger than 0.') + + self.topk = topk + super().__init__(collect_device=collect_device, prefix=prefix) + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]): + """Process one batch of data and predictions. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch (Sequence[dict]): A batch of data from the dataloader. + predictions (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred_score = data_sample['pred_score'].cpu() + gt_label = format_label(data_sample['gt_label']) + + if 'gt_score' in data_sample: + target = data_sample.get('gt_score').clone() + else: + num_classes = pred_score.size()[-1] + target = F.one_hot(gt_label, num_classes) + + # Because the retrieval output logit vector will be much larger + # compared to the normal classification, to save resources, the + # evaluation results are computed each batch here and then reduce + # all results at the end. + result = RetrievalRecall.calculate( + pred_score.unsqueeze(0), target.unsqueeze(0), topk=self.topk) + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. 
+ """ + result_metrics = dict() + for i, k in enumerate(self.topk): + recall_at_k = sum([r[i].item() for r in results]) / len(results) + result_metrics[f'Recall@{k}'] = recall_at_k + + return result_metrics + + @staticmethod + def calculate(pred: Union[np.ndarray, torch.Tensor], + target: Union[np.ndarray, torch.Tensor], + topk: Union[int, Sequence[int]], + pred_indices: (bool) = False, + target_indices: (bool) = False) -> float: + """Calculate the average recall. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. A :obj:`torch.Tensor` or :obj:`np.ndarray` with + shape ``(N, M)`` or a sequence of index/onehot + format labels. + target (torch.Tensor | np.ndarray | Sequence): The prediction + results. A :obj:`torch.Tensor` or :obj:`np.ndarray` with + shape ``(N, M)`` or a sequence of index/onehot + format labels. + topk (int, Sequence[int]): Predictions with the k-th highest + scores are considered as positive. + pred_indices (bool): Whether the ``pred`` is a sequence of + category index labels. Defaults to False. + target_indices (bool): Whether the ``target`` is a sequence of + category index labels. Defaults to False. + + Returns: + List[float]: the average recalls. 
+ """ + topk = (topk, ) if isinstance(topk, int) else topk + for k in topk: + if k <= 0: + raise ValueError('`topk` must be a ingter larger than 0 ' + 'or seq of ingter larger than 0.') + + max_keep = max(topk) + pred = _format_pred(pred, max_keep, pred_indices) + target = _format_target(target, target_indices) + + assert len(pred) == len(target), ( + f'Length of `pred`({len(pred)}) and `target` ({len(target)}) ' + f'must be the same.') + + num_samples = len(pred) + results = [] + for k in topk: + recalls = torch.zeros(num_samples) + for i, (sample_pred, + sample_target) in enumerate(zip(pred, target)): + sample_pred = np.array(to_tensor(sample_pred).cpu()) + sample_target = np.array(to_tensor(sample_target).cpu()) + recalls[i] = int(np.in1d(sample_pred[:k], sample_target).max()) + results.append(recalls.mean() * 100) + return results + + +def _format_pred(label, topk=None, is_indices=False): + """format various label to List[indices].""" + if is_indices: + assert isinstance(label, Sequence), \ + '`pred` must be Sequence of indices when' \ + f' `pred_indices` set to True, but get {type(label)}' + for i, sample_pred in enumerate(label): + assert is_seq_of(sample_pred, int) or isinstance( + sample_pred, (np.ndarray, torch.Tensor)), \ + '`pred` should be Sequence of indices when `pred_indices`' \ + f'set to True. 
but pred[{i}] is {sample_pred}' + if topk: + label[i] = sample_pred[:min(topk, len(sample_pred))] + return label + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + elif not isinstance(label, torch.Tensor): + raise TypeError(f'The pred must be type of torch.tensor, ' + f'np.ndarray or Sequence but get {type(label)}.') + topk = topk if topk else label.size()[-1] + _, indices = label.topk(topk) + return indices + + +def _format_target(label, is_indices=False): + """format various label to List[indices].""" + if is_indices: + assert isinstance(label, Sequence), \ + '`target` must be Sequence of indices when' \ + f' `target_indices` set to True, but get {type(label)}' + for i, sample_gt in enumerate(label): + assert is_seq_of(sample_gt, int) or isinstance( + sample_gt, (np.ndarray, torch.Tensor)), \ + '`target` should be Sequence of indices when ' \ + f'`target_indices` set to True. but target[{i}] is {sample_gt}' + return label + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + elif isinstance(label, Sequence) and not mmengine.is_str(label): + label = torch.tensor(label) + elif not isinstance(label, torch.Tensor): + raise TypeError(f'The pred must be type of torch.tensor, ' + f'np.ndarray or Sequence but get {type(label)}.') + + indices = [sample_gt.nonzero().squeeze(-1) for sample_gt in label] + return indices diff --git a/mmaction/evaluation/metrics/multisports_metric.py b/mmaction/evaluation/metrics/multisports_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..6b806d7863d5c86877b51c3e3e74481ea64714a7 --- /dev/null +++ b/mmaction/evaluation/metrics/multisports_metric.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
from typing import Any, Optional, Sequence, Tuple

import numpy as np
from mmengine import load
from mmengine.evaluator import BaseMetric

from mmaction.evaluation import frameAP, link_tubes, videoAP, videoAP_all
from mmaction.registry import METRICS


@METRICS.register_module()
class MultiSportsMetric(BaseMetric):
    """MAP Metric for MultiSports dataset.

    Args:
        ann_file (str): Path to the annotation file loaded via
            :func:`mmengine.load`.
        metric_options (dict, optional): Thresholds for the frame-level and
            video-level mAP. Defaults to None, which is treated as
            ``dict(F_mAP=dict(thr=0.5),
            V_mAP=dict(thr=(0.2, 0.5), all=True, tube_thr=15))``.
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        verbose (bool): Whether to print per-class AP details.
            Defaults to True.
        prefix (str, optional): The prefix that will be added in the metric
            names. Defaults to None.
    """
    default_prefix: Optional[str] = 'mAP'

    def __init__(self,
                 ann_file: str,
                 metric_options: Optional[dict] = None,
                 collect_device: str = 'cpu',
                 verbose: bool = True,
                 prefix: Optional[str] = None):
        super().__init__(collect_device=collect_device, prefix=prefix)

        # Fix: the default used to be a shared mutable dict in the signature;
        # build a fresh one per instance instead. (The old ``thr=(0.5)`` was
        # a plain float, not a tuple — spelled plainly here.)
        if metric_options is None:
            metric_options = dict(
                F_mAP=dict(thr=0.5),
                V_mAP=dict(thr=(0.2, 0.5), all=True, tube_thr=15))
        self.metric_options = metric_options
        self.annos = load(ann_file)
        self.verbose = verbose

    def process(self, data_batch: Sequence[Tuple[Any, dict]],
                data_samples: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions. The processed
        results should be stored in ``self.results``, which will be used to
        compute the metrics when all batches have been processed.

        Args:
            data_batch (Sequence[Tuple[Any, dict]]): A batch of data
                from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from
                the model.
        """
        for pred in data_samples:
            video_key = pred['video_id'].split('.mp4')[0]
            frm_num = pred['timestamp']
            bboxes = pred['pred_instances']['bboxes'].cpu().numpy()
            cls_scores = pred['pred_instances']['scores'].cpu().numpy()
            det_result = [video_key, frm_num, bboxes, cls_scores]

            self.results.append(det_result)

    def compute_metrics(self, results: list) -> dict:
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch.
        Returns:
            dict: The computed metrics. The keys are the names of the metrics,
            and the values are corresponding results.
        """
        test_videos = self.annos['test_videos'][0]
        resolutions = self.annos['resolution']
        detections = []
        for result in results:
            video_key, frm_num, bboxes, cls_scores = result
            for bbox, cls_score in zip(bboxes, cls_scores):
                video_idx = test_videos.index(video_key)
                pred_label = np.argmax(cls_score)
                score = cls_score[pred_label]
                # Boxes are normalised; scale back to pixel coordinates.
                h, w = resolutions[video_key]
                bbox *= np.array([w, h, w, h])
                instance_result = np.array(
                    [video_idx, frm_num, pred_label, score, *bbox])
                detections.append(instance_result)

        frm_detections = np.array(detections)

        metric_result = dict()
        f_map = frameAP(self.annos, frm_detections,
                        self.metric_options['F_mAP']['thr'], self.verbose)
        metric_result.update({'frameAP': round(f_map, 4)})
        # Link frame detections into tubes before video-level scoring.
        video_tubes = link_tubes(
            self.annos,
            frm_detections,
            len_thre=self.metric_options['V_mAP']['tube_thr'])

        v_map = {}
        for thr in self.metric_options['V_mAP']['thr']:
            # Renamed from ``map`` to avoid shadowing the builtin.
            map_at_thr = videoAP(
                self.annos, video_tubes, thr, print_info=self.verbose)
            v_map.update({f'v_map@{thr}': round(map_at_thr, 4)})
        metric_result.update(v_map)
        if self.metric_options['V_mAP'].get('all'):
            all_map = videoAP_all(self.annos, video_tubes)
            metric_result.update(all_map)
        return metric_result
Defaults to ``('R1', 'R5', 'R10', 'MdR', 'MnR')``. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + default_prefix = 'retrieval' + + def __init__(self, + metric_list: Union[Tuple[str], + str] = ('R1', 'R5', 'R10', 'MdR', 'MnR'), + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + if isinstance(metric_list, str): + metric_list = (metric_list, ) + + for metric in metric_list: + if metric not in ['R1', 'R5', 'R10', 'MdR', 'MnR']: + raise ValueError(f'RetrievalMetric only supports ' + f"'R1', 'R5', 'R10', 'MdR', 'MnR', " + f"but got '{metric}. '") + + self.metric_list = metric_list + + def process(self, data_batch: Optional[Dict], + data_samples: Sequence[Dict]) -> None: + """Process one batch of data samples and data_samples. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict, optional): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + data_samples = copy.deepcopy(data_samples) + + for data_sample in data_samples: + results = dict() + features = data_sample['features'] + video_feature = features['video_feature'].cpu().numpy() + text_feature = features['text_feature'].cpu().numpy() + results['video_feature'] = video_feature + results['text_feature'] = text_feature + self.results.append(results) + + def compute_metrics(self, results: List) -> Dict: + """Compute the metrics from processed results. 
+ + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + + video_features = np.stack([res['video_feature'] for res in results]) + text_features = np.stack([res['text_feature'] for res in results]) + + video_features = video_features / np.linalg.norm( + video_features, axis=-1, keepdims=True) + text_features = text_features / np.linalg.norm( + text_features, axis=-1, keepdims=True) + + similarity = text_features @ video_features.T + + sx = np.sort(-similarity) + d = np.diag(-similarity) + ind = np.where((sx - d[:, None]) == 0)[1] + + metrics = OrderedDict() + for metric in self.metric_list: + if metric == 'R1': + metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind) + elif metric == 'R5': + metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind) + elif metric == 'R10': + metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind) + elif metric == 'MdR': + metrics['MdR'] = np.median(ind) + 1 + elif metric == 'MnR': + metrics['MnR'] = np.mean(ind) + 1 + + return metrics diff --git a/mmaction/evaluation/metrics/video_grounding_metric.py b/mmaction/evaluation/metrics/video_grounding_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9307fb145956766d57c8f5f7a79cfc204196aa --- /dev/null +++ b/mmaction/evaluation/metrics/video_grounding_metric.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
from typing import Any, Optional, Sequence, Tuple

from mmengine.evaluator import BaseMetric

from mmaction.registry import METRICS


@METRICS.register_module()
class RecallatTopK(BaseMetric):
    """Recall@TopK metric for temporal video grounding.

    A ground-truth interval counts as recalled if any of the top-K
    predicted intervals overlaps it with temporal IoU above ``threshold``.

    Args:
        topK_list (tuple[int]): K values to evaluate. Defaults to (1, 5).
        threshold (float): Temporal IoU threshold. Defaults to 0.5.
        collect_device (str): Device used to collect results from ranks
            ('cpu' or 'gpu'). Defaults to ``'cpu'``.
        prefix (str, optional): Prefix added to metric names.
            Defaults to None.
    """

    def __init__(self,
                 topK_list: Tuple[int] = (1, 5),
                 threshold: float = 0.5,
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None):
        super().__init__(collect_device=collect_device, prefix=prefix)
        self.topK_list = topK_list
        self.threshold = threshold

    def process(self, data_batch: Sequence[Tuple[Any, dict]],
                predictions: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions. The processed
        results should be stored in ``self.results``, which will be used to
        compute the metrics when all batches have been processed.

        Args:
            data_batch (Sequence[Tuple[Any, dict]]): A batch of data
                from the dataloader.
            predictions (Sequence[dict]): A batch of outputs from
                the model.
        """
        for pred in predictions:
            self.results.append(pred)

    def compute_metrics(self, results: list) -> dict:
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch. Each item
                carries a ``gt`` interval and ranked ``predictions``.

        Returns:
            dict: The computed metrics. The keys are the names of the
                metrics, and the values are corresponding results.
        """
        eval_results = dict()
        # Hoisted: the sample count does not depend on topK.
        total = len(results)
        for topK in self.topK_list:
            correct = 0.0
            for result in results:
                gt = result['gt']
                for candidate in result['predictions'][:topK]:
                    if self.calculate_IoU(gt, candidate) > self.threshold:
                        correct += 1
                        break  # one hit per sample is enough
            eval_results[f'Recall@Top{topK}_IoU={self.threshold}'] = \
                correct / total
        return eval_results

    def calculate_IoU(self, i0, i1):
        """Temporal IoU of two ``(start, end)`` intervals.

        BUG FIX: the original returned a negative ratio for disjoint
        intervals and raised ZeroDivisionError for a zero-length union.
        Disjoint intervals now yield 0.0; thresholding behavior for any
        ``threshold > 0`` is unchanged.
        """
        union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
        inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
        span = union[1] - union[0]
        if span <= 0:
            return 0.0
        return max(0.0, inter[1] - inter[0]) / span
# --- original diff header preserved: ---
# diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py
# new file mode 100644 index 0000000000000000000000000000000000000000..b6964616c5219a88f78571b4737798d18ec6721a
# --- /dev/null +++ b/mmaction/models/__init__.py @@ -0,0 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .backbones import *  # noqa: F401,F403
from .common import *  # noqa: F401,F403
from .data_preprocessors import *  # noqa: F401,F403
from .heads import *  # noqa: F401,F403
from .localizers import *  # noqa: F401,F403
from .losses import *  # noqa: F401,F403
from .multimodal import *  # noqa: F401,F403
from .necks import *  # noqa: F401,F403
from .recognizers import *  # noqa: F401,F403
from .roi_heads import *  # noqa: F401,F403
from .similarity import *  # noqa: F401,F403
from .task_modules import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
# --- residue preserved (NOTE(review): committed .pyc artifacts should be
# --- removed from the change and gitignored): ---
# diff --git a/mmaction/models/__pycache__/__init__.cpython-312.pyc b/mmaction/models/__pycache__/__init__.cpython-312.pyc
# new file mode 100644 index 0000000000000000000000000000000000000000..d76c88a7b43572f883693021b1c9483e8fab6b32
# Binary files /dev/null and b/mmaction/models/__pycache__/__init__.cpython-312.pyc differ
# diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py new file mode 100644 index
0000000000000000000000000000000000000000..3b1cd9e76a2fde3e541001c5189d0825c736fc7b --- /dev/null +++ b/mmaction/models/backbones/__init__.py @@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Public manifest of backbone networks; importing a backbone module also
+# registers it with the MODELS registry via its decorator.
+from .aagcn import AAGCN
+from .c2d import C2D
+from .c3d import C3D
+from .mobilenet_v2 import MobileNetV2
+from .mobilenet_v2_tsm import MobileNetV2TSM
+from .mvit import MViT
+from .resnet import ResNet
+from .resnet2plus1d import ResNet2Plus1d
+from .resnet3d import ResNet3d, ResNet3dLayer
+from .resnet3d_csn import ResNet3dCSN
+from .resnet3d_slowfast import ResNet3dSlowFast
+from .resnet3d_slowonly import ResNet3dSlowOnly
+from .resnet_audio import ResNetAudio
+from .resnet_omni import OmniResNet
+from .resnet_tin import ResNetTIN
+from .resnet_tsm import ResNetTSM
+from .rgbposeconv3d import RGBPoseConv3D
+from .stgcn import STGCN
+from .swin import SwinTransformer3D
+from .tanet import TANet
+from .timesformer import TimeSformer
+from .uniformer import UniFormer
+from .uniformerv2 import UniFormerV2
+from .vit_mae import VisionTransformer
+from .x3d import X3D
+
+# NOTE(review): list is alphabetical except 'RGBPoseConv3D', which was
+# appended at the end.
+__all__ = [
+    'AAGCN', 'C2D', 'C3D', 'MViT', 'MobileNetV2', 'MobileNetV2TSM',
+    'OmniResNet', 'ResNet', 'ResNet2Plus1d', 'ResNet3d', 'ResNet3dCSN',
+    'ResNet3dLayer', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNetAudio',
+    'ResNetTIN', 'ResNetTSM', 'STGCN', 'SwinTransformer3D', 'TANet',
+    'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D',
+    'RGBPoseConv3D'
+]
+
+# MobileOneTSM has an optional dependency; it is silently skipped when the
+# import fails so the rest of the backbones stay usable.
+try:
+    from .mobileone_tsm import MobileOneTSM  # noqa: F401
+    __all__.append('MobileOneTSM')
+
+except (ImportError, ModuleNotFoundError):
+    pass
diff --git a/mmaction/models/backbones/__pycache__/__init__.cpython-312.pyc b/mmaction/models/backbones/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ef6ee5e9466ddbf791f65e84cc4f176dc0f7573 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/__init__.cpython-312.pyc differ diff --git
a/mmaction/models/backbones/__pycache__/aagcn.cpython-312.pyc b/mmaction/models/backbones/__pycache__/aagcn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e2ac2d79a82acc468f8955451c49eae9b688690 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/aagcn.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/c2d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/c2d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3b769785e987c04c6008bd7701cb720003836dd Binary files /dev/null and b/mmaction/models/backbones/__pycache__/c2d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/c3d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/c3d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..059652c216f1e3965f2be32c2ab5da29b5f63dab Binary files /dev/null and b/mmaction/models/backbones/__pycache__/c3d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-312.pyc b/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d62d9ebfd2ccbcc57df1edd4913fdf94f77cc68e Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobilenet_v2.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-312.pyc b/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d1571e71c458054c4e07b385dd1671606929490 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobilenet_v2_tsm.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-312.pyc b/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..612f30667ba0aec62d2ff446b4a505dee8398519 
Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mobileone_tsm.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/mvit.cpython-312.pyc b/mmaction/models/backbones/__pycache__/mvit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71e6e9ee3941d6209ff516ae165dc14dd6a67389 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/mvit.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc552094c3881b6232d0aaec5d9ccb49b4140b2 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58053a9175201c0846386725c8ea36562cf28159 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet2plus1d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet3d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74c1ebbc8e62c72d41501e336452c3761f0fc349 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..376e65b480add075d760527f5ba392a567a2f4a3 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_csn.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-312.pyc 
b/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8ae682886c27e6258b588ae97306c309d5e8413 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_slowfast.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99da07a2551b001c3c2a25f9507a4fbe3a94a943 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet3d_slowonly.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_audio.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e419ed72f5293461f498987e6848e31c188f2fac Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_audio.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_omni.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet_omni.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ace72fd3e56dd5fb7405d2224bb12c0c6697868e Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_omni.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_tin.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet_tin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e405042592a9629c5d55419c8ad16005c9de797 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_tin.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-312.pyc b/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..716bc2691eae322c29f95a7da38f339941d42007 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/resnet_tsm.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..922043fb8d81bce2dd32b05643d569b376de239a Binary files /dev/null and b/mmaction/models/backbones/__pycache__/rgbposeconv3d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/stgcn.cpython-312.pyc b/mmaction/models/backbones/__pycache__/stgcn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fd224ad52bd783ad9c3263052fd06a168b312c5 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/stgcn.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/swin.cpython-312.pyc b/mmaction/models/backbones/__pycache__/swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80a24229d2d5a32fc2c71b4d05d34508d106e0c1 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/swin.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/tanet.cpython-312.pyc b/mmaction/models/backbones/__pycache__/tanet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c41b724110ba2297bb93d2fb185384ac3f4776aa Binary files /dev/null and b/mmaction/models/backbones/__pycache__/tanet.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/timesformer.cpython-312.pyc b/mmaction/models/backbones/__pycache__/timesformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..844b2edaefb48dccc35a16de83674ae5f222d765 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/timesformer.cpython-312.pyc differ diff --git 
a/mmaction/models/backbones/__pycache__/uniformer.cpython-312.pyc b/mmaction/models/backbones/__pycache__/uniformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f211abe297d56eeb6c85e9a149f14cf8ac2ffd1 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/uniformer.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/uniformerv2.cpython-312.pyc b/mmaction/models/backbones/__pycache__/uniformerv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a96b772e37c8900f5e6a9b38dd4207fa8035d9de Binary files /dev/null and b/mmaction/models/backbones/__pycache__/uniformerv2.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/vit_mae.cpython-312.pyc b/mmaction/models/backbones/__pycache__/vit_mae.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e70a3809747eedb1511272c13edaf3277b6ea276 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/vit_mae.cpython-312.pyc differ diff --git a/mmaction/models/backbones/__pycache__/x3d.cpython-312.pyc b/mmaction/models/backbones/__pycache__/x3d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5c294c858dfd149dc010baf64d80310db374b32 Binary files /dev/null and b/mmaction/models/backbones/__pycache__/x3d.cpython-312.pyc differ diff --git a/mmaction/models/backbones/aagcn.py b/mmaction/models/backbones/aagcn.py new file mode 100644 index 0000000000000000000000000000000000000000..42a085bed65cc7811375de9324d410d3cf6e8652 --- /dev/null +++ b/mmaction/models/backbones/aagcn.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
# --- mmaction/models/backbones/aagcn.py (diff hunk, reformatted) ---
import copy as cp
from typing import Dict, List, Optional, Union

import torch
import torch.nn as nn
from mmengine.model import BaseModule, ModuleList

from mmaction.registry import MODELS
from ..utils import Graph, unit_aagcn, unit_tcn


class AAGCNBlock(BaseModule):
    """The basic block of AAGCN: a graph convolution followed by a
    temporal convolution, with an optional residual path.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        A (torch.Tensor): The adjacency matrix defined in the graph
            with shape of `(num_subsets, num_nodes, num_nodes)`.
        stride (int): Stride of the temporal convolution. Defaults to 1.
        residual (bool): Whether to use residual connection. Defaults to True.
        init_cfg (dict or list[dict], optional): Config to control
            the initialization. Defaults to None.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 A: torch.Tensor,
                 stride: int = 1,
                 residual: bool = True,
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)

        # Route kwargs by prefix: 'gcn_*' -> graph conv, 'tcn_*' -> temporal
        # conv; anything else is an error.
        gcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'gcn_'}
        tcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'tcn_'}
        kwargs = {
            k: v
            for k, v in kwargs.items() if k[:4] not in ['gcn_', 'tcn_']
        }
        assert len(kwargs) == 0, f'Invalid arguments: {kwargs}'

        tcn_type = tcn_kwargs.pop('type', 'unit_tcn')
        assert tcn_type in ['unit_tcn', 'mstcn']
        gcn_type = gcn_kwargs.pop('type', 'unit_aagcn')
        assert gcn_type in ['unit_aagcn']

        self.gcn = unit_aagcn(in_channels, out_channels, A, **gcn_kwargs)

        if tcn_type == 'unit_tcn':
            self.tcn = unit_tcn(
                out_channels, out_channels, 9, stride=stride, **tcn_kwargs)
        else:
            # BUG FIX: 'mstcn' passed the assert above but ``self.tcn`` was
            # never built, deferring failure to an opaque AttributeError in
            # ``forward``. Fail fast with a clear message instead.
            raise NotImplementedError(
                f"tcn_type '{tcn_type}' is not implemented here; "
                "only 'unit_tcn' is available.")

        self.relu = nn.ReLU()

        if not residual:
            self.residual = lambda x: 0
        elif (in_channels == out_channels) and (stride == 1):
            self.residual = lambda x: x
        else:
            # Channel/stride mismatch: project the residual with a 1x1 tcn.
            self.residual = unit_tcn(
                in_channels, out_channels, kernel_size=1, stride=stride)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        return self.relu(self.tcn(self.gcn(x)) + self.residual(x))


@MODELS.register_module()
class AAGCN(BaseModule):
    """AAGCN backbone, the attention-enhanced version of 2s-AGCN.

    Skeleton-Based Action Recognition with Multi-Stream
    Adaptive Graph Convolutional Networks.

    Two-Stream Adaptive Graph Convolutional Networks for
    Skeleton-Based Action Recognition.

    Args:
        graph_cfg (dict): Config for building the graph.
        in_channels (int): Number of input channels. Defaults to 3.
        base_channels (int): Number of base channels. Defaults to 64.
        data_bn_type (str): Type of the data bn layer. Defaults to ``'MVC'``.
        num_person (int): Maximum number of people. Only used when
            data_bn_type == 'MVC'. Defaults to 2.
        num_stages (int): Total number of stages. Defaults to 10.
        inflate_stages (list[int]): Stages to inflate the number of channels.
            Defaults to ``[5, 8]``.
        down_stages (list[int]): Stages to perform downsampling in
            the time dimension. Defaults to ``[5, 8]``.
        init_cfg (dict or list[dict], optional): Config to control
            the initialization. Defaults to None.

    Examples:
        >>> import torch
        >>> from mmaction.models import AAGCN
        >>> from mmaction.utils import register_all_modules
        >>>
        >>> register_all_modules()
        >>> mode = 'stgcn_spatial'
        >>> batch_size, num_person, num_frames = 2, 2, 150
        >>>
        >>> # nturgb+d layout
        >>> num_joints = 25
        >>> model = AAGCN(graph_cfg=dict(layout='nturgb+d', mode=mode))
        >>> model.init_weights()
        >>> inputs = torch.randn(batch_size, num_person,
        ...                      num_frames, num_joints, 3)
        >>> output = model(inputs)
        >>> print(output.shape)
        >>>
        >>> # custom settings
        >>> # disable the attention module to degenerate AAGCN to AGCN
        >>> model = AAGCN(graph_cfg=dict(layout='coco', mode=mode),
        ...               gcn_attention=False)
        >>> model.init_weights()
        >>> inputs = torch.randn(batch_size, num_person,
        ...                      num_frames, 17, 3)
        >>> output = model(inputs)
        >>> print(output.shape)
        torch.Size([2, 2, 256, 38, 25])
        torch.Size([2, 2, 256, 38, 17])
    """

    def __init__(self,
                 graph_cfg: Dict,
                 in_channels: int = 3,
                 base_channels: int = 64,
                 data_bn_type: str = 'MVC',
                 num_person: int = 2,
                 num_stages: int = 10,
                 inflate_stages: List[int] = [5, 8],
                 down_stages: List[int] = [5, 8],
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)

        self.graph = Graph(**graph_cfg)
        A = torch.tensor(
            self.graph.A, dtype=torch.float32, requires_grad=False)
        # Registered as a buffer so it moves with the module but is not a
        # learnable parameter.
        self.register_buffer('A', A)

        assert data_bn_type in ['MVC', 'VC', None]
        self.data_bn_type = data_bn_type
        self.in_channels = in_channels
        self.base_channels = base_channels
        self.num_person = num_person
        self.num_stages = num_stages
        self.inflate_stages = inflate_stages
        self.down_stages = down_stages

        if self.data_bn_type == 'MVC':
            self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1))
        elif self.data_bn_type == 'VC':
            self.data_bn = nn.BatchNorm1d(in_channels * A.size(1))
        else:
            self.data_bn = nn.Identity()

        # Per-stage kwargs: a tuple of length num_stages distributes one
        # value per stage; anything else is shared by all stages.
        lw_kwargs = [cp.deepcopy(kwargs) for _ in range(num_stages)]
        for k, v in kwargs.items():
            if isinstance(v, tuple) and len(v) == num_stages:
                for i in range(num_stages):
                    lw_kwargs[i][k] = v[i]
        # The first block has no preceding tcn, so dropout does not apply.
        lw_kwargs[0].pop('tcn_dropout', None)

        modules = []
        if self.in_channels != self.base_channels:
            modules = [
                AAGCNBlock(
                    in_channels,
                    base_channels,
                    A.clone(),
                    1,
                    residual=False,
                    **lw_kwargs[0])
            ]

        for i in range(2, num_stages + 1):
            in_channels = base_channels
            out_channels = base_channels * (1 + (i in inflate_stages))
            stride = 1 + (i in down_stages)
            modules.append(
                AAGCNBlock(
                    base_channels,
                    out_channels,
                    A.clone(),
                    stride=stride,
                    **lw_kwargs[i - 1]))
            base_channels = out_channels

        # Without a stem projection block the effective stage count drops.
        if self.in_channels == self.base_channels:
            self.num_stages -= 1

        self.gcn = ModuleList(modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): Input of shape ``(N, M, T, V, C)``.

        Returns:
            torch.Tensor: Output of shape ``(N, M, C', T', V)``.
        """
        N, M, T, V, C = x.size()
        x = x.permute(0, 1, 3, 4, 2).contiguous()
        if self.data_bn_type == 'MVC':
            x = self.data_bn(x.view(N, M * V * C, T))
        else:
            x = self.data_bn(x.view(N * M, V * C, T))

        x = x.view(N, M, V, C, T).permute(0, 1, 3, 4,
                                          2).contiguous().view(N * M, C, T, V)

        for i in range(self.num_stages):
            x = self.gcn[i](x)

        x = x.reshape((N, M) + x.shape[1:])
        return x
# --- original diff header preserved: ---
# diff --git a/mmaction/models/backbones/c2d.py b/mmaction/models/backbones/c2d.py
# new file mode 100644 index 0000000000000000000000000000000000000000..42ca5eb9e697f8f10267302e3729a8e8c7aacb55
# --- /dev/null +++ b/mmaction/models/backbones/c2d.py @@ -0,0 +1,89 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule

from mmaction.models.backbones.resnet import ResNet
from mmaction.registry import MODELS


@MODELS.register_module()
class C2D(ResNet):
    """C2D backbone.

    Compared to ResNet-50, a temporal-pool is added after the first
    bottleneck. Detailed structure is kept same as "video-nonlocal-net" repo.
    Please refer to https://github.com/facebookresearch/video-nonlocal-net/blob
    /main/scripts/run_c2d_baseline_400k.sh.
    Please note that there are some improvements compared to "Non-local Neural
    Networks" paper (https://arxiv.org/abs/1711.07971).
    Differences are noted at https://github.com/facebookresearch/video-nonlocal
    -net#modifications-for-improving-speed.
    """

    def _make_stem_layer(self) -> None:
        """Construct the stem layers consists of a conv+norm+act module and a
        pooling layer."""
        self.conv1 = ConvModule(
            self.in_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        # Spatial pool after the stem; temporal pool after the first stage.
        self.maxpool3d_1 = nn.MaxPool3d(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
        self.maxpool3d_2 = nn.MaxPool3d(
            kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

    def forward(self, x: torch.Tensor) \
            -> Union[torch.Tensor, Tuple[torch.Tensor]]:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            Union[torch.Tensor or Tuple[torch.Tensor]]: The feature of the
                input samples extracted by the backbone.
        """
        batches = x.shape[0]

        # The 2D ResNet stages operate on (N*T, C, H, W); the two 3D pools
        # need the (N, C, T, H, W) layout, so we convert back and forth.
        def _convert_to_2d(x: torch.Tensor) -> torch.Tensor:
            """(N, C, T, H, W) -> (N x T, C, H, W)"""
            x = x.permute((0, 2, 1, 3, 4))
            x = x.reshape(-1, x.shape[2], x.shape[3], x.shape[4])
            return x

        def _convert_to_3d(x: torch.Tensor) -> torch.Tensor:
            """(N x T, C, H, W) -> (N, C, T, H, W)"""
            x = x.reshape(batches, -1, x.shape[1], x.shape[2], x.shape[3])
            x = x.permute((0, 2, 1, 3, 4))
            return x

        x = _convert_to_2d(x)
        x = self.conv1(x)
        x = _convert_to_3d(x)
        x = self.maxpool3d_1(x)
        x = _convert_to_2d(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i == 0:
                # Temporal pooling only after the first stage.
                x = _convert_to_3d(x)
                x = self.maxpool3d_2(x)
                x = _convert_to_2d(x)
            if i in self.out_indices:
                x = _convert_to_3d(x)
                outs.append(x)
        if len(outs) == 1:
            return outs[0]

        return tuple(outs)
# --- original diff header preserved: ---
# diff --git a/mmaction/models/backbones/c3d.py b/mmaction/models/backbones/c3d.py
# new file mode 100644 index 0000000000000000000000000000000000000000..9feee6652397d42ba6e2f0600721273cc0b75a70
# --- /dev/null +++ b/mmaction/models/backbones/c3d.py @@ -0,0 +1,144 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmengine.logging import MMLogger
from mmengine.model.weight_init import constant_init, kaiming_init, normal_init
from mmengine.runner import load_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm

from mmaction.registry import MODELS


@MODELS.register_module()
class C3D(nn.Module):
    """C3D backbone.

    Args:
        pretrained (str | None): Name of pretrained model.
        style (str): ``pytorch`` or ``caffe``. If set to "pytorch", the
            stride-two layer is the 3x3 conv layer, otherwise the stride-two
            layer is the first 1x1 conv layer. Default: 'pytorch'.
        conv_cfg (dict | None): Config dict for convolution layer.
            If set to None, it uses ``dict(type='Conv3d')`` to construct
            layers. Default: None.
        norm_cfg (dict | None): Config for norm layers. required keys are
            ``type``, Default: None.
        act_cfg (dict | None): Config dict for activation layer. If set to
            None, it uses ``dict(type='ReLU')`` to construct layers.
            Default: None.
        out_dim (int): The dimension of last layer feature (after flatten).
            Depends on the input shape. Default: 8192.
        dropout_ratio (float): Probability of dropout layer. Default: 0.5.
        init_std (float): Std value for Initiation of fc layers.
            Default: 0.005.
    """
    # DOC FIX: ``init_std`` default was documented as 0.01 but the code
    # default is 0.005.

    def __init__(self,
                 pretrained=None,
                 style='pytorch',
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 out_dim=8192,
                 dropout_ratio=0.5,
                 init_std=0.005):
        super().__init__()
        if conv_cfg is None:
            conv_cfg = dict(type='Conv3d')
        if act_cfg is None:
            act_cfg = dict(type='ReLU')
        self.pretrained = pretrained
        self.style = style
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std

        c3d_conv_param = dict(
            kernel_size=(3, 3, 3),
            padding=(1, 1, 1),
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

        self.conv1a = ConvModule(3, 64, **c3d_conv_param)
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2a = ConvModule(64, 128, **c3d_conv_param)
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = ConvModule(128, 256, **c3d_conv_param)
        self.conv3b = ConvModule(256, 256, **c3d_conv_param)
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = ConvModule(256, 512, **c3d_conv_param)
        self.conv4b = ConvModule(512, 512, **c3d_conv_param)
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = ConvModule(512, 512, **c3d_conv_param)
        self.conv5b = ConvModule(512, 512, **c3d_conv_param)
        self.pool5 = nn.MaxPool3d(
            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(out_dim, 4096)
        self.fc7 = nn.Linear(4096, 4096)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=self.dropout_ratio)

    def init_weights(self):
        """Initiate the parameters either from existing checkpoint or from
        scratch."""
        if isinstance(self.pretrained, str):
            logger = MMLogger.get_current_instance()
            logger.info(f'load model from: {self.pretrained}')

            load_checkpoint(self, self.pretrained, strict=False, logger=logger)

        elif self.pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv3d):
                    kaiming_init(m)
                elif isinstance(m, nn.Linear):
                    normal_init(m, std=self.init_std)
                elif isinstance(m, _BatchNorm):
                    constant_init(m, 1)

        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.
                the size of x is (num_batches, 3, 16, 112, 112).

        Returns:
            torch.Tensor: The feature of the input
                samples extracted by the backbone.
        """
        x = self.conv1a(x)
        x = self.pool1(x)

        x = self.conv2a(x)
        x = self.pool2(x)

        x = self.conv3a(x)
        x = self.conv3b(x)
        x = self.pool3(x)

        x = self.conv4a(x)
        x = self.conv4b(x)
        x = self.pool4(x)

        x = self.conv5a(x)
        x = self.conv5b(x)
        x = self.pool5(x)

        x = x.flatten(start_dim=1)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))

        return x
# --- original diff header preserved: ---
# diff --git a/mmaction/models/backbones/mobilenet_v2.py b/mmaction/models/backbones/mobilenet_v2.py
# new file mode 100644 index 0000000000000000000000000000000000000000..daa32e725f841fcb00a6e86807e0459f13e23211
# --- /dev/null +++ b/mmaction/models/backbones/mobilenet_v2.py @@ -0,0 +1,324 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS + + +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Defaults to None, means that the minimum value equal to the + divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Defaults to 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + Returns: + Tensor: The output tensor + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=1, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The output of the module. + """ + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@MODELS.register_module() +class MobileNetV2(BaseModule): + """MobileNetV2 backbone. + + Args: + pretrained (str | None): Name of pretrained model. Defaults to None. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (None or Sequence[int]): Output from which stages. 
+ Defaults to (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). Note that + the last stage in ``MobileNetV2`` is ``conv2``. Defaults to -1, + which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Kaiming', layer='Conv2d',), + dict(type='Constant', layer=['GroupNorm', '_BatchNorm'], val=1.) + ]``. + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + pretrained=None, + widen_factor=1., + out_indices=(7, ), + frozen_stages=-1, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN2d', requires_grad=True), + act_cfg=dict(type='ReLU6', inplace=True), + norm_eval=False, + with_cp=False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['GroupNorm', '_BatchNorm'], + val=1.) 
+ ]): + if pretrained is not None: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.widen_factor = widen_factor + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 8): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 9): + raise ValueError('frozen_stages must be in range(-1, 9). ' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. 
+ + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Defaults to 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Defaults to 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride, + expand_ratio=expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor or Tuple[Tensor]: The feature of the input samples extracted + by the backbone. + """ + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer_name = self.layers[i - 1] + layer = getattr(self, layer_name) + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Set the optimization status when training.""" + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..1a71284e7e50ab67e890a85734a20b365db9cc92 --- 
/dev/null +++ b/mmaction/models/backbones/mobilenet_v2_tsm.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint + +from mmaction.registry import MODELS +from .mobilenet_v2 import InvertedResidual, MobileNetV2 +from .resnet_tsm import TemporalShift + + +@MODELS.register_module() +class MobileNetV2TSM(MobileNetV2): + """MobileNetV2 backbone for TSM. + + Args: + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + shift_div (int): Number of div for shift. Defaults to 8. + pretraind2d (bool): Whether to load pretrained 2D model. + Defaults to True. + **kwargs (keyword arguments, optional): Arguments for MobilNetV2. + """ + + def __init__(self, + num_segments=8, + is_shift=True, + shift_div=8, + pretrained2d=True, + **kwargs): + super().__init__(**kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.pretrained2d = pretrained2d + self.init_structure() + + def make_temporal_shift(self): + """Make temporal shift for some layers.""" + for m in self.modules(): + if isinstance(m, InvertedResidual) and \ + len(m.conv) == 3 and m.use_res_connect: + m.conv[0] = TemporalShift( + m.conv[0], + num_segments=self.num_segments, + shift_div=self.shift_div, + ) + + def init_structure(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.is_shift: + self.make_temporal_shift() + + def load_original_weights(self, logger): + original_state_dict = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in original_state_dict: + original_state_dict = original_state_dict['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + ori_name = name + for wrap_prefix in ['.net']: + if wrap_prefix in ori_name: + ori_name = 
ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + # convert wrapped keys + for param_name in list(original_state_dict.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + original_state_dict[wrapped_name] = original_state_dict.pop( + param_name) + + msg = self.load_state_dict(original_state_dict, strict=True) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() diff --git a/mmaction/models/backbones/mobileone_tsm.py b/mmaction/models/backbones/mobileone_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..d12e84408831e90e69e7ca9cee2062de783fcd85 --- /dev/null +++ b/mmaction/models/backbones/mobileone_tsm.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch.nn as nn +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import (_load_checkpoint, + _load_checkpoint_with_prefix) +from mmpretrain.models import MobileOne + +from mmaction.registry import MODELS +from .resnet_tsm import TemporalShift + + +@MODELS.register_module() +class MobileOneTSM(MobileOne): + """MobileOne backbone for TSM. + + Args: + arch (str | dict): MobileOne architecture. If use string, choose + from 's0', 's1', 's2', 's3' and 's4'. If use dict, it should + have below keys: + + - num_blocks (Sequence[int]): Number of blocks in each stage. + - width_factor (Sequence[float]): Width factor in each stage. + - num_conv_branches (Sequence[int]): Number of conv branches + in each stage. 
+ - num_se_blocks (Sequence[int]): Number of SE layers in each + stage, all the SE layers are placed in the subsequent order + in each stage. + + Defaults to 's0'. + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + shift_div (int): Number of div for shift. Defaults to 8. + pretraind2d (bool): Whether to load pretrained 2D model. + Defaults to True. + **kwargs (keyword arguments, optional): Arguments for MobileOne. + """ + + def __init__(self, + arch: str, + num_segments: int = 8, + is_shift: bool = True, + shift_div: int = 8, + pretrained2d: bool = True, + **kwargs): + super().__init__(arch, **kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.pretrained2d = pretrained2d + self.init_structure() + + def make_temporal_shift(self): + """Make temporal shift for some layers. + + To make reparameterization work, we can only build the shift layer + before the 'block', instead of the 'blockres' + """ + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. 
+ """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + blocks[i] = TemporalShift( + b, num_segments=num_segments, shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.stage0 = make_block_temporal( + nn.Sequential(self.stage0), self.num_segments)[0] + for i in range(1, 5): + temporal_stage = make_block_temporal( + getattr(self, f'stage{i}'), self.num_segments) + setattr(self, f'stage{i}', temporal_stage) + + def init_structure(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.is_shift: + self.make_temporal_shift() + + def load_original_weights(self, logger): + assert self.init_cfg.get('type') == 'Pretrained', ( + 'Please specify ' + 'init_cfg to use pretrained 2d checkpoint') + self.pretrained = self.init_cfg.get('checkpoint') + prefix = self.init_cfg.get('prefix') + if prefix is not None: + original_state_dict = _load_checkpoint_with_prefix( + prefix, self.pretrained, map_location='cpu') + else: + original_state_dict = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in original_state_dict: + original_state_dict = original_state_dict['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + ori_name = name + for wrap_prefix in ['.net']: + if wrap_prefix in ori_name: + ori_name = ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + # convert wrapped keys + for param_name in list(original_state_dict.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + original_state_dict[wrapped_name] = original_state_dict.pop( + param_name) + + msg = self.load_state_dict(original_state_dict, strict=True) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = 
MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + super().init_weights() + + def forward(self, x): + """unpack tuple result.""" + x = super().forward(x) + if isinstance(x, tuple): + assert len(x) == 1 + x = x[0] + return x diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py new file mode 100644 index 0000000000000000000000000000000000000000..cb808a8c0230e04bd5f06c7c9275486cfc26cbf1 --- /dev/null +++ b/mmaction/models/backbones/mvit.py @@ -0,0 +1,909 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from mmengine.utils import to_3tuple + +from mmaction.registry import MODELS +from mmaction.utils import get_str_type +from ..utils.embed import PatchEmbed3D + + +def resize_pos_embed(pos_embed: torch.Tensor, + src_shape: Tuple[int], + dst_shape: Tuple[int], + mode: str = 'trilinear', + num_extra_tokens: int = 1) -> torch.Tensor: + """Resize pos_embed weights. + + Args: + pos_embed (torch.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (T, H, W). + dst_shape (tuple): The resolution of downsampled new training + image, in format (T, H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'trilinear'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. 
+ + Returns: + torch.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1] \ + and src_shape[2] == dst_shape[2]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_t, src_h, src_w = src_shape + assert L == src_t * src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_t}*{src_h}*{src_w}+{num_extra_tokens}).' \ + 'Please check the `img_size` argument.' + extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape(1, src_t, src_h, src_w, + C).permute(0, 4, 1, 2, 3) + + dst_weight = F.interpolate( + src_weight, size=dst_shape, align_corners=False, mode=mode) + dst_weight = torch.flatten(dst_weight, 2).transpose(1, 2) + + return torch.cat((extra_tokens, dst_weight), dim=1) + + +def resize_decomposed_rel_pos(rel_pos: torch.Tensor, q_size: int, + k_size: int) -> torch.Tensor: + """Get relative positional embeddings according to the relative positions + of query and key sizes. + + Args: + rel_pos (Tensor): relative position embeddings (L, C). + q_size (int): size of query q. + k_size (int): size of key k. + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + resized = F.interpolate( + # (L, C) -> (1, C, L) + rel_pos.transpose(0, 1).unsqueeze(0), + size=max_rel_dist, + mode='linear', + ) + # (1, C, L) -> (L, C) + resized = resized.squeeze(0).transpose(0, 1) + else: + resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. 
+ q_h_ratio = max(k_size / q_size, 1.0) + k_h_ratio = max(q_size / k_size, 1.0) + q_coords = torch.arange(q_size)[:, None] * q_h_ratio + k_coords = torch.arange(k_size)[None, :] * k_h_ratio + relative_coords = (q_coords - k_coords) + (k_size - 1) * k_h_ratio + + return resized[relative_coords.long()] + + +def add_decomposed_rel_pos(attn: torch.Tensor, + q: torch.Tensor, + q_shape: Sequence[int], + k_shape: Sequence[int], + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + rel_pos_t: torch.Tensor, + with_cls_token: bool = False) -> torch.Tensor: + """Spatiotemporal Relative Positional Embeddings.""" + sp_idx = 1 if with_cls_token else 0 + B, num_heads, _, C = q.shape + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + + Rt = resize_decomposed_rel_pos(rel_pos_t, q_t, k_t) + Rh = resize_decomposed_rel_pos(rel_pos_h, q_h, k_h) + Rw = resize_decomposed_rel_pos(rel_pos_w, q_w, k_w) + + r_q = q[:, :, sp_idx:].reshape(B, num_heads, q_t, q_h, q_w, C) + rel_t = torch.einsum('bythwc,tkc->bythwk', r_q, Rt) + rel_h = torch.einsum('bythwc,hkc->bythwk', r_q, Rh) + rel_w = torch.einsum('bythwc,wkc->bythwk', r_q, Rw) + rel_pos_embed = ( + rel_t[:, :, :, :, :, :, None, None] + + rel_h[:, :, :, :, :, None, :, None] + + rel_w[:, :, :, :, :, None, None, :]) + + attn_map = attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, + k_h, k_w) + attn_map += rel_pos_embed + attn[:, :, sp_idx:, sp_idx:] = attn_map.view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +class MLP(BaseModule): + """Two-layer multilayer perceptron. + + Comparing with :class:`mmcv.cnn.bricks.transformer.FFN`, this class allows + different input and output channel numbers. + + Args: + in_channels (int): The number of input channels. + hidden_channels (int, optional): The number of hidden layer channels. + If None, same as the ``in_channels``. Defaults to None. + out_channels (int, optional): The number of output channels. If None, + same as the ``in_channels``. Defaults to None. 
+ act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + hidden_channels: Optional[int] = None, + out_channels: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_channels, out_channels) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +def attention_pool(x: torch.Tensor, + pool: nn.Module, + in_size: Tuple[int], + with_cls_token: bool = False, + norm: Optional[nn.Module] = None) -> tuple: + """Pooling the feature tokens. + + Args: + x (torch.Tensor): The input tensor, should be with shape + ``(B, num_heads, L, C)`` or ``(B, L, C)``. + pool (nn.Module): The pooling module. + in_size (Tuple[int]): The shape of the input feature map. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + norm (nn.Module, optional): The normalization module. + Defaults to None. 
+ """ + ndim = x.ndim + if ndim == 4: + B, num_heads, L, C = x.shape + elif ndim == 3: + num_heads = 1 + B, L, C = x.shape + x = x.unsqueeze(1) + else: + raise RuntimeError(f'Unsupported input dimension {x.shape}') + + T, H, W = in_size + assert L == T * H * W + with_cls_token + + if with_cls_token: + cls_tok, x = x[:, :, :1, :], x[:, :, 1:, :] + + # (B, num_heads, T*H*W, C) -> (B*num_heads, C, T, H, W) + x = x.reshape(B * num_heads, T, H, W, C).permute(0, 4, 1, 2, + 3).contiguous() + x = pool(x) + out_size = x.shape[2:] + + # (B*num_heads, C, T', H', W') -> (B, num_heads, T'*H'*W', C) + x = x.reshape(B, num_heads, C, -1).transpose(2, 3) + + if with_cls_token: + x = torch.cat((cls_tok, x), dim=2) + + if norm is not None: + x = norm(x) + + if ndim == 3: + x = x.squeeze(1) + + return x, out_size + + +class MultiScaleAttention(BaseModule): + """Multiscale Multi-head Attention block. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. 
+ with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__(self, + in_dims: int, + out_dims: int, + num_heads: int, + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN'), + pool_kernel: Tuple[int] = (3, 3, 3), + stride_q: Tuple[int] = (1, 1, 1), + stride_kv: Tuple[int] = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + with_cls_token: bool = True, + init_cfg: Optional[dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + + head_dim = out_dims // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(in_dims, out_dims * 3, bias=qkv_bias) + self.proj = nn.Linear(out_dims, out_dims) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + pool_dims = out_dims // num_heads + + def build_pooling(stride): + pool = nn.Conv3d( + pool_dims, + pool_dims, + pool_kernel, + stride=stride, + padding=pool_padding, + groups=pool_dims, + bias=False, + ) + norm = build_norm_layer(norm_cfg, pool_dims)[1] + return pool, norm + + self.pool_q, self.norm_q = build_pooling(stride_q) + self.pool_k, self.norm_k = build_pooling(stride_kv) + self.pool_v, self.norm_v = build_pooling(stride_kv) + + self.residual_pooling = residual_pooling + + self.rel_pos_embed = rel_pos_embed + self.rel_pos_zero_init = rel_pos_zero_init + if self.rel_pos_embed: + # initialize relative positional embeddings + assert input_size[1] == input_size[2] + + size = input_size[1] + rel_dim = 2 * max(size // stride_q[1], size // stride_kv[1]) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_t = 
nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + + def init_weights(self) -> None: + """Weight initialization.""" + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and get_str_type(self.init_cfg['type']) == 'Pretrained'): + # Suppress rel_pos_zero_init if use pretrained model. + return + + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + trunc_normal_(self.rel_pos_t, std=0.02) + + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: + """Forward the MultiScaleAttention.""" + B, N, _ = x.shape # (B, H*W, C) + + # qkv: (B, H*W, 3, num_heads, C) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1) + # q, k, v: (B, num_heads, H*W, C) + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0) + + q, q_shape = attention_pool( + q, + self.pool_q, + in_size, + norm=self.norm_q, + with_cls_token=self.with_cls_token) + k, k_shape = attention_pool( + k, + self.pool_k, + in_size, + norm=self.norm_k, + with_cls_token=self.with_cls_token) + v, v_shape = attention_pool( + v, + self.pool_v, + in_size, + norm=self.norm_v, + with_cls_token=self.with_cls_token) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_embed: + attn = add_decomposed_rel_pos(attn, q, q_shape, k_shape, + self.rel_pos_h, self.rel_pos_w, + self.rel_pos_t, self.with_cls_token) + + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + if self.with_cls_token: + x[:, :, 1:, :] += q[:, :, 1:, :] + else: + x = x + q + + # (B, num_heads, H'*W', C'//num_heads) -> (B, H'*W', C') + x = x.transpose(1, 2).reshape(B, -1, self.out_dims) + x = self.proj(x) + + return x, q_shape + + +class MultiScaleBlock(BaseModule): + """Multiscale Transformer blocks. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. 
+ qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + drop_path (float): Stochastic depth rate. Defaults to 0. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial relative + position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__( + self, + in_dims: int, + out_dims: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + drop_path: float = 0.0, + norm_cfg: Dict = dict(type='LN'), + act_cfg: Dict = dict(type='GELU'), + qkv_pool_kernel: Tuple = (3, 3, 3), + stride_q: Tuple = (1, 1, 1), + stride_kv: Tuple = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + with_cls_token: bool = True, + dim_mul_in_attention: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + init_cfg: Optional[Dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + self.norm1 = build_norm_layer(norm_cfg, in_dims)[1] + self.dim_mul_in_attention = dim_mul_in_attention + + attn_dims = out_dims if dim_mul_in_attention else in_dims + self.attn = MultiScaleAttention( + in_dims, + attn_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + pool_kernel=qkv_pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init, + with_cls_token=with_cls_token) + self.drop_path = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, attn_dims)[1] + + self.mlp = MLP( + in_channels=attn_dims, + hidden_channels=int(attn_dims * mlp_ratio), + out_channels=out_dims, + act_cfg=act_cfg) + + if in_dims != out_dims: + self.proj = nn.Linear(in_dims, out_dims) + else: + self.proj = None + + if np.prod(stride_q) > 1: + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + padding_skip = [int(skip // 2) for skip in kernel_skip] + self.pool_skip = nn.MaxPool3d( + kernel_skip, stride_q, padding_skip, ceil_mode=False) + + if input_size is not None: + input_size = to_3tuple(input_size) + out_size = [size // s for size, s in zip(input_size, stride_q)] + 
self.init_out_size = out_size + else: + self.init_out_size = None + else: + self.pool_skip = None + self.init_out_size = input_size + + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: + x_norm = self.norm1(x) + x_attn, out_size = self.attn(x_norm, in_size) + + if self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + if self.pool_skip is not None: + skip, _ = attention_pool( + skip, + self.pool_skip, + in_size, + with_cls_token=self.with_cls_token) + + x = skip + self.drop_path(x_attn) + x_norm = self.norm2(x) + x_mlp = self.mlp(x_norm) + + if not self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + x = skip + self.drop_path(x_mlp) + + return x, out_size + + +@MODELS.register_module() +class MViT(BaseModule): + """Multi-scale ViT v2. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Inspiration from `the official implementation + `_ and `the mmclassification + implementation `_ + + Args: + arch (str | dict): MViT architecture. If use string, choose + from 'tiny', 'small', 'base' and 'large'. If use dict, it should + have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **num_layers** (int): The number of layers. + - **num_heads** (int): The number of heads in attention + modules of the initial layer. + - **downscale_indices** (List[int]): The layer indices to downscale + the feature map. + + Defaults to 'base'. + spatial_size (int): The expected input spatial_size shape. + Defaults to 224. + temporal_size (int): The expected input temporal_size shape. + Defaults to 224. + in_channels (int): The num of input channels. Defaults to 3. + pretrained (str, optional): Name of pretrained model. + Defaults to None. + pretrained_type (str, optional): Type of pretrained model. choose from + 'imagenet', 'maskfeat', None. 
Defaults to None, which means load + from same architecture. + out_scales (int | Sequence[int]): The output scale indices. + They should not exceed the length of ``downscale_indices``. + Defaults to -1, which means the last scale. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults to False. + interpolate_mode (str): Select the interpolate mode for absolute + position embedding vector resize. Defaults to "trilinear". + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + dim_mul (int): The magnification for ``embed_dims`` in the downscale + layers. Defaults to 2. + head_mul (int): The magnification for ``num_heads`` in the downscale + layers. Defaults to 2. + adaptive_kv_stride (int): The stride size for kv pooling in the initial + layer. Defaults to (1, 8, 8). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + output_cls_token (bool): Whether output the cls_token. If set True, + ``with_cls_token`` must be True. Defaults to True. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): enable bias for qkv if True. Defaults to True. + norm_cfg (dict): Config dict for normalization layer for all output + features. Defaults to ``dict(type='LN', eps=1e-6)``. + patch_cfg (dict): Config dict for the patch embedding layer. 
+ Defaults to + ``dict(kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3))``. + init_cfg (dict, optional): The Config for initialization. Defaults to + ``[ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ]`` + + Examples: + >>> import torch + >>> from mmaction.registry import MODELS + >>> from mmaction.utils import register_all_modules + >>> register_all_modules() + >>> + >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) + >>> model = MODELS.build(cfg) + >>> model.init_weights() + >>> inputs = torch.rand(1, 3, 16, 224, 224) + >>> outputs = model(inputs) + >>> for i, output in enumerate(outputs): + >>> print(f'scale{i}: {output.shape}') + scale0: torch.Size([1, 96, 8, 56, 56]) + scale1: torch.Size([1, 192, 8, 28, 28]) + scale2: torch.Size([1, 384, 8, 14, 14]) + scale3: torch.Size([1, 768, 8, 7, 7]) + """ + arch_zoo = { + 'tiny': { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [1, 3, 8] + }, + 'small': { + 'embed_dims': 96, + 'num_layers': 16, + 'num_heads': 1, + 'downscale_indices': [1, 3, 14] + }, + 'base': { + 'embed_dims': 96, + 'num_layers': 24, + 'num_heads': 1, + 'downscale_indices': [2, 5, 21] + }, + 'large': { + 'embed_dims': 144, + 'num_layers': 48, + 'num_heads': 2, + 'downscale_indices': [2, 8, 44] + }, + } + num_extra_tokens = 1 + + def __init__( + self, + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + pretrained_type: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = 
True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( + kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3)), + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ] + ) -> None: + if pretrained: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg.copy()) + self.pretrained_type = pretrained_type + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'downscale_indices' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.num_layers = self.arch_settings['num_layers'] + self.num_heads = self.arch_settings['num_heads'] + self.downscale_indices = self.arch_settings['downscale_indices'] + # Defaults take downscale_indices as downscale_indices + self.dim_mul_indices = self.arch_settings.get( + 'dim_mul_indices', self.downscale_indices.copy()) + self.num_scales = len(self.downscale_indices) + 1 + self.stage_indices = { + index - 1: i + for i, index in enumerate(self.downscale_indices) + } + self.stage_indices[self.num_layers - 1] = self.num_scales - 1 + self.use_abs_pos_embed = use_abs_pos_embed + self.interpolate_mode = interpolate_mode + + if isinstance(out_scales, int): + out_scales = [out_scales] + assert isinstance(out_scales, Sequence), \ + 
f'"out_scales" must by a sequence or int, ' \ + f'get {type(out_scales)} instead.' + for i, index in enumerate(out_scales): + if index < 0: + out_scales[i] = self.num_scales + index + assert 0 <= out_scales[i] <= self.num_scales, \ + f'Invalid out_scales {index}' + self.out_scales = sorted(list(out_scales)) + + # Set patch embedding + _patch_cfg = dict( + in_channels=in_channels, + input_size=(temporal_size, spatial_size, spatial_size), + embed_dims=self.embed_dims, + conv_type='Conv3d', + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + # Set cls token + if output_cls_token: + assert with_cls_token is True, f'with_cls_token must be True if' \ + f'set output_cls_token to True, but got {with_cls_token}' + self.with_cls_token = with_cls_token + self.output_cls_token = output_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims)) + + # Set absolute position embedding + if self.use_abs_pos_embed: + num_patches = np.prod(self.patch_resolution) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_extra_tokens, + self.embed_dims)) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, self.num_layers) + + self.blocks = ModuleList() + out_dims_list = [self.embed_dims] + num_heads = self.num_heads + stride_kv = adaptive_kv_stride + input_size = self.patch_resolution + for i in range(self.num_layers): + if i in self.downscale_indices or i in self.dim_mul_indices: + num_heads *= head_mul + + if i in self.downscale_indices: + stride_q = [1, 2, 2] + stride_kv = [max(s // 2, 1) for s in stride_kv] + else: + stride_q = [1, 1, 1] + + # Set output embed_dims + if dim_mul_in_attention and i in self.dim_mul_indices: + # multiply embed_dims in dim_mul_indices layers. + out_dims = out_dims_list[-1] * dim_mul + elif not dim_mul_in_attention and i + 1 in self.dim_mul_indices: + # multiply embed_dims before dim_mul_indices layers. 
+ out_dims = out_dims_list[-1] * dim_mul + else: + out_dims = out_dims_list[-1] + + attention_block = MultiScaleBlock( + in_dims=out_dims_list[-1], + out_dims=out_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_cfg=norm_cfg, + qkv_pool_kernel=pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + with_cls_token=with_cls_token, + dim_mul_in_attention=dim_mul_in_attention, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.blocks.append(attention_block) + + input_size = attention_block.init_out_size + out_dims_list.append(out_dims) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + norm_layer = build_norm_layer(norm_cfg, out_dims)[1] + self.add_module(f'norm{stage_index}', norm_layer) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + # interpolate maskfeat relative position embedding + if self.pretrained_type == 'maskfeat': + logger = MMLogger.get_current_instance() + pretrained = self.init_cfg['checkpoint'] + logger.info(f'load pretrained model from {pretrained}') + state_dict = _load_checkpoint_with_prefix( + 'backbone.', pretrained, map_location='cpu') + attn_rel_pos_keys = [ + k for k in state_dict.keys() if 'attn.rel_pos' in k + ] + for k in attn_rel_pos_keys: + attn_rel_pos_pretrained = state_dict[k] + attn_rel_pos_current = self.state_dict()[k] + L1, dim1 = attn_rel_pos_pretrained.size() + L2, dim2 = attn_rel_pos_current.size() + if dim1 != dim2: + logger.warning(f'Dim mismatch in loading {k}, passing') + else: + if L1 != L2: + interp_param = torch.nn.functional.interpolate( + attn_rel_pos_pretrained.t().unsqueeze(0), + size=L2, + mode='linear') + interp_param = \ + interp_param.view(dim2, L2).permute(1, 0) + state_dict[k] = interp_param + logger.info( + f'{k} reshaped from {(L1, dim1)} to {L2, dim2}') + msg = 
    def forward(self, x: torch.Tensor) ->\
            Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
        """Forward the MViT.

        Args:
            x (torch.Tensor): Input video tensor; assumed to be
                ``(B, C, T, H, W)`` as consumed by the Conv3d-based
                ``patch_embed`` — TODO confirm against PatchEmbed3D.

        Returns:
            tuple: One entry per requested scale in ``self.out_scales``;
                each entry is ``[patch_token, cls_token]`` when
                ``output_cls_token`` is set, otherwise the patch token
                tensor alone.
        """
        B = x.shape[0]
        # Tokenize the video and get the (T', H', W') token-grid size.
        x, patch_resolution = self.patch_embed(x)

        # Prepend one class token per sample.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        if self.use_abs_pos_embed:
            # Resize the stored position table to the actual token grid,
            # keeping the extra (class) token embeddings untouched.
            x = x + resize_pos_embed(
                self.pos_embed,
                self.patch_resolution,
                patch_resolution,
                mode=self.interpolate_mode,
                num_extra_tokens=self.num_extra_tokens)

        if not self.with_cls_token:
            # Remove class token for transformer encoder input
            x = x[:, 1:]

        outs = []
        for i, block in enumerate(self.blocks):
            # Blocks may pool queries, shrinking the token grid.
            x, patch_resolution = block(x, patch_resolution)

            if i in self.stage_indices:
                stage_index = self.stage_indices[i]
                if stage_index in self.out_scales:
                    B, _, C = x.shape
                    # Per-stage output norm registered in __init__ as
                    # `norm{stage_index}`.
                    # NOTE(review): `x` is reassigned here, so subsequent
                    # blocks receive the normalized tensor — confirm this
                    # is intended rather than normalizing a copy.
                    x = getattr(self, f'norm{stage_index}')(x)
                    tokens = x.transpose(1, 2)
                    if self.with_cls_token:
                        # Split off the class token before restoring the
                        # (B, C, T', H', W') layout for the patch tokens.
                        patch_token = tokens[:, :, 1:].reshape(
                            B, C, *patch_resolution)
                        cls_token = tokens[:, :, 0]
                    else:
                        patch_token = tokens.reshape(B, C, *patch_resolution)
                        cls_token = None
                    if self.output_cls_token:
                        out = [patch_token, cls_token]
                    else:
                        out = patch_token
                    outs.append(out)

        return tuple(outs)
class BasicBlock(nn.Module):
    """Basic residual block for ResNet.

    Two 3x3 conv-norm-act units plus an identity shortcut; the final ReLU
    is applied after the residual addition.

    Args:
        inplanes (int): Number of input channels of the first conv layer.
        planes (int): Number of output channels of both conv layers.
        stride (int): Stride of the first conv layer. Defaults to 1.
        dilation (int): Dilation (and padding) of the first conv layer.
            Defaults to 1.
        downsample (nn.Module, optional): Module applied to the identity
            branch so its shape matches the residual branch.
            Defaults to None.
        style (str): ``pytorch`` or ``caffe``. Accepted for interface
            parity with the bottleneck block; both styles behave the same
            here since there is no stride-carrying 1x1 conv.
            Defaults to ``pytorch``.
        conv_cfg (Union[dict, ConfigDict]): Config for conv layers.
            Defaults to ``dict(type='Conv')``.
        norm_cfg (Union[dict, ConfigDict]): Config for norm layers;
            required keys are ``type`` and ``requires_grad``.
            Defaults to ``dict(type='BN', requires_grad=True)``.
        act_cfg (Union[dict, ConfigDict]): Config for activation layers.
            Defaults to ``dict(type='ReLU', inplace=True)``.
        with_cp (bool): Gradient checkpointing is not supported by the
            basic block, so this must stay False. Defaults to False.
    """
    expansion = 1

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 stride: int = 1,
                 dilation: int = 1,
                 downsample: Optional[nn.Module] = None,
                 style: str = 'pytorch',
                 conv_cfg: ConfigType = dict(type='Conv'),
                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
                 act_cfg: ConfigType = dict(type='ReLU', inplace=True),
                 with_cp: bool = False) -> None:
        super().__init__()
        assert style in ['pytorch', 'caffe']

        # First unit carries the stride/dilation; the second keeps stride
        # 1 and omits the activation (ReLU is applied after the addition).
        self.conv1 = ConvModule(
            inplanes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv2 = ConvModule(
            planes,
            planes,
            kernel_size=3,
            stride=1,
            padding=1,
            dilation=1,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.style = style
        self.stride = stride
        self.dilation = dilation
        self.norm_cfg = norm_cfg
        # Checkpointing is not implemented for the basic block.
        assert not with_cp

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute ``relu(conv2(conv1(x)) + shortcut(x))``.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        shortcut = x if self.downsample is None else self.downsample(x)
        residual = self.conv2(self.conv1(x))
        return self.relu(residual + shortcut)
If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + conv_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + with_cp: bool = False) -> None: + super().__init__() + assert style in ['pytorch', 'caffe'] + self.inplanes = inplanes + self.planes = planes + if style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + self.conv1 = ConvModule( + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv3 = ConvModule( + planes, + planes * self.expansion, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.norm_cfg = norm_cfg + 
self.with_cp = with_cp + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + conv_cfg: Optional[ConfigType] = None, + norm_cfg: Optional[ConfigType] = None, + act_cfg: Optional[ConfigType] = None, + with_cp: bool = False) -> nn.Module: + """Build residual layer for ResNet. + + Args: + block: (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature in each block. + planes (int): Number of channels for the output feature in each block. + blocks (int): Number of residual blocks. + stride (int): Stride in the conv layer. Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + conv_cfg (Union[dict, ConfigDict], optional): Config for norm layers. + Defaults to None. + norm_cfg (Union[dict, ConfigDict], optional): Config for norm layers. + Defaults to None. + act_cfg (Union[dict, ConfigDict], optional): Config for activate + layers. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
Defaults to False. + + Returns: + nn.Module: A residual layer for the given config. + """ + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = ConvModule( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + dilation, + style=style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + +@MODELS.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from ``{18, 34, 50, 101, 152}``. + pretrained (str, optional): Name of pretrained model. Defaults to None. + torchvision_pretrain (bool): Whether to load pretrained model from + torchvision. Defaults to True. + in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. + out_indices (Sequence[int]): Indices of output feature. + Defaults to (3, ). + strides (Sequence[int]): Strides of the first block of each stage. + Defaults to ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Defaults to ``(1, 1, 1, 1)``. + style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``pytorch``. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Defaults to -1. + conv_cfg (dict or ConfigDict): Config for norm layers. + Defaults ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. 
required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Defaults to False. + partial_bn (bool): Whether to use partial bn. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Kaiming', layer='Conv2d',), + dict(type='Constant', layer='BatchNorm', val=1.) + ]``. + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__( + self, + depth: int, + pretrained: Optional[str] = None, + torchvision_pretrain: bool = True, + in_channels: int = 3, + num_stages: int = 4, + out_indices: Sequence[int] = (3, ), + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + style: str = 'pytorch', + frozen_stages: int = -1, + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + partial_bn: bool = False, + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='Kaiming', layer='Conv2d'), + dict(type='Constant', layer='BatchNorm2d', val=1.) 
+ ] + ) -> None: + super().__init__(init_cfg=init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.in_channels = in_channels + self.pretrained = pretrained + self.torchvision_pretrain = torchvision_pretrain + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.style = style + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.partial_bn = partial_bn + self.with_cp = with_cp + + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = 64 + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * 64 * 2**( + len(self.stage_blocks) - 1) + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + @staticmethod + def _load_conv_params(conv: nn.Module, state_dict_tv: OrderedDict, + module_name_tv: 
str, + loaded_param_names: List[str]) -> None: + """Load the conv parameters of resnet from torchvision. + + Args: + conv (nn.Module): The destination conv module. + state_dict_tv (OrderedDict): The state dict of pretrained + torchvision model. + module_name_tv (str): The name of corresponding conv module in the + torchvision model. + loaded_param_names (list[str]): List of parameters that have been + loaded. + """ + + weight_tv_name = module_name_tv + '.weight' + if conv.weight.data.shape == state_dict_tv[weight_tv_name].shape: + conv.weight.data.copy_(state_dict_tv[weight_tv_name]) + loaded_param_names.append(weight_tv_name) + + if getattr(conv, 'bias') is not None: + bias_tv_name = module_name_tv + '.bias' + if conv.bias.data.shape == state_dict_tv[bias_tv_name].shape: + conv.bias.data.copy_(state_dict_tv[bias_tv_name]) + loaded_param_names.append(bias_tv_name) + + @staticmethod + def _load_bn_params(bn: nn.Module, state_dict_tv: OrderedDict, + module_name_tv: str, + loaded_param_names: List[str]) -> None: + """Load the bn parameters of resnet from torchvision. + + Args: + bn (nn.Module): The destination bn module. + state_dict_tv (OrderedDict): The state dict of pretrained + torchvision model. + module_name_tv (str): The name of corresponding bn module in the + torchvision model. + loaded_param_names (list[str]): List of parameters that have been + loaded. 
+ """ + + for param_name, param in bn.named_parameters(): + param_tv_name = f'{module_name_tv}.{param_name}' + param_tv = state_dict_tv[param_tv_name] + if param.data.shape == param_tv.shape: + param.data.copy_(param_tv) + loaded_param_names.append(param_tv_name) + + for param_name, param in bn.named_buffers(): + param_tv_name = f'{module_name_tv}.{param_name}' + # some buffers like num_batches_tracked may not exist + if param_tv_name in state_dict_tv: + param_tv = state_dict_tv[param_tv_name] + if param.data.shape == param_tv.shape: + param.data.copy_(param_tv) + loaded_param_names.append(param_tv_name) + + def _load_torchvision_checkpoint(self, + logger: mmengine.MMLogger = None) -> None: + """Initiate the parameters from torchvision pretrained checkpoint.""" + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_torchvision: + state_dict_torchvision = state_dict_torchvision['state_dict'] + + loaded_param_names = [] + for name, module in self.named_modules(): + if isinstance(module, ConvModule): + # we use a ConvModule to wrap conv+bn+relu layers, thus the + # name mapping is needed + if 'downsample' in name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + original_conv_name = name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + original_bn_name = name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + original_conv_name = name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + original_bn_name = name.replace('conv', 'bn') + self._load_conv_params(module.conv, state_dict_torchvision, + original_conv_name, loaded_param_names) + self._load_bn_params(module.bn, state_dict_torchvision, + original_bn_name, loaded_param_names) + + # check if any parameters in the 2d checkpoint are not loaded + remaining_names = set( + state_dict_torchvision.keys()) - set(loaded_param_names) + if remaining_names: + logger.info( + f'These parameters in pretrained checkpoint are 
not loaded' + f': {remaining_names}') + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + if self.torchvision_pretrain: + # torchvision's + self._load_torchvision_checkpoint(logger) + else: + # ours + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + elif self.pretrained is None: + super().init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + Union[torch.Tensor or Tuple[torch.Tensor]]: The feature of the + input samples extracted by the backbone. + """ + x = self.conv1(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.bn.eval() + for m in self.conv1.modules(): + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def _partial_bn(self) -> None: + """Freezing BatchNorm2D except the first one.""" + logger = MMLogger.get_current_instance() + logger.info('Freezing BatchNorm2D except the first one.') + count_bn = 0 + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + m.eval() + # shutdown update in frozen mode + m.weight.requires_grad = False + 
m.bias.requires_grad = False + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + if mode and self.partial_bn: + self._partial_bn() diff --git a/mmaction/models/backbones/resnet2plus1d.py b/mmaction/models/backbones/resnet2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..61d36e921dbe67b63ab18dbe627fb0136774495a --- /dev/null +++ b/mmaction/models/backbones/resnet2plus1d.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.registry import MODELS +from mmaction.utils import get_str_type +from .resnet3d import ResNet3d + + +@MODELS.register_module() +class ResNet2Plus1d(ResNet3d): + """ResNet (2+1)d backbone. + + This model is proposed in `A Closer Look at Spatiotemporal Convolutions for + Action Recognition `_ + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.pretrained2d is False + assert get_str_type(self.conv_cfg['type']) == 'Conv2plus1d' + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. 
+ """ + x = self.conv1(x) + x = self.maxpool(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + # no pool2 in R(2+1)d + x = res_layer(x) + + return x diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py new file mode 100644 index 0000000000000000000000000000000000000000..aa764256081a7fee5c6137503978b54201bea98d --- /dev/null +++ b/mmaction/models/backbones/resnet3d.py @@ -0,0 +1,1060 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, NonLocal3d, build_activation_layer +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, Sequential +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm +from torch.nn.modules.utils import _ntuple, _triple + +from mmaction.registry import MODELS + + +class BasicBlock3d(BaseModule): + """BasicBlock 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. + Defaults to 1. + temporal_stride (int): Temporal stride in the conv3d layer. + Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module or None): Downsample layer. Defaults to None. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + inflate (bool): Whether to inflate kernel. Defaults to True. 
+ non_local (bool): Determine whether to apply non-local module in this + block. Defaults to False. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. + Required keys are ``type``. Defaults to ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + spatial_stride: int = 1, + temporal_stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + inflate: bool = True, + non_local: bool = False, + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + assert style in ['pytorch', 'caffe'] + # make sure that only ``inflate_style`` is passed into kwargs + assert set(kwargs).issubset(['inflate_style']) + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.style = style + self.inflate = inflate + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + self.conv1_stride_s = spatial_stride + self.conv2_stride_s = 1 + self.conv1_stride_t = temporal_stride + self.conv2_stride_t = 1 + + if self.inflate: + conv1_kernel_size = (3, 3, 3) + conv1_padding = (1, 
dilation, dilation) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, 1, 1) + else: + conv1_kernel_size = (1, 3, 3) + conv1_padding = (0, dilation, dilation) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, 1, 1) + + self.conv1 = ConvModule( + inplanes, + planes, + conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + dilation=(1, dilation, dilation), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv2 = ConvModule( + planes, + planes * self.expansion, + conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.downsample = downsample + self.relu = build_activation_layer(self.act_cfg) + + if self.non_local: + self.non_local_block = NonLocal3d(self.conv2.norm.num_features, + **self.non_local_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +class Bottleneck3d(BaseModule): + """Bottleneck 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. + Defaults to 1. + temporal_stride (int): Temporal stride in the conv3d layer. + Defaults to 1. + dilation (int): Spacing between kernel elements. 
Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + inflate (bool): Whether to inflate kernel. Defaults to True. + inflate_style (str): '3x1x1' or '3x3x3'. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Defaults to ``'3x1x1'``. + non_local (bool): Determine whether to apply non-local module in this + block. Defaults to False. + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required + keys are ``type``. Defaults to ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + spatial_stride: int = 1, + temporal_stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + inflate: bool = True, + inflate_style: str = '3x1x1', + non_local: bool = False, + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + assert style in ['pytorch', 'caffe'] + assert inflate_style in ['3x1x1', '3x3x3'] + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.style = style + self.inflate = inflate + self.inflate_style = inflate_style + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + if self.style == 'pytorch': + self.conv1_stride_s = 1 + self.conv2_stride_s = spatial_stride + self.conv1_stride_t = 1 + self.conv2_stride_t = temporal_stride + else: + self.conv1_stride_s = spatial_stride + self.conv2_stride_s = 1 + self.conv1_stride_t = temporal_stride + self.conv2_stride_t = 1 + + if self.inflate: + if inflate_style == '3x1x1': + conv1_kernel_size = (3, 1, 1) + conv1_padding = (1, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + + self.conv1 = ConvModule( + inplanes, + planes, + conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + 
padding=conv1_padding, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv2 = ConvModule( + planes, + planes, + conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + dilation=(1, dilation, dilation), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.conv3 = ConvModule( + planes, + planes * self.expansion, + 1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + # No activation in the third ConvModule for bottleneck + act_cfg=None) + + self.downsample = downsample + self.relu = build_activation_layer(self.act_cfg) + + if self.non_local: + self.non_local_block = NonLocal3d(self.conv3.norm.num_features, + **self.non_local_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +@MODELS.register_module() +class ResNet3d(BaseModule): + """ResNet 3d backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + Defaults to 50. + pretrained (str, optional): Name of pretrained model. Defaults to None. + stage_blocks (tuple, optional): Set number of stages for each res + layer. Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. 
@MODELS.register_module()
class ResNet3d(BaseModule):
    """ResNet 3d backbone.

    Args:
        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
            Defaults to 50.
        pretrained (str, optional): Name of pretrained model. Defaults to None.
        stage_blocks (tuple, optional): Set number of stages for each res
            layer. Defaults to None.
        pretrained2d (bool): Whether to load pretrained 2D model.
            Defaults to True.
        in_channels (int): Channel num of input features. Defaults to 3.
        num_stages (int): Resnet stages. Defaults to 4.
        base_channels (int): Channel num of stem output features.
            Defaults to 64.
        out_indices (Sequence[int]): Indices of output feature.
            Defaults to ``(3, )``.
        spatial_strides (Sequence[int]):
            Spatial strides of residual blocks of each stage.
            Defaults to ``(1, 2, 2, 2)``.
        temporal_strides (Sequence[int]):
            Temporal strides of residual blocks of each stage.
            Defaults to ``(1, 1, 1, 1)``.
        dilations (Sequence[int]): Dilation of each stage.
            Defaults to ``(1, 1, 1, 1)``.
        conv1_kernel (Sequence[int]): Kernel size of the first conv layer.
            Defaults to ``(3, 7, 7)``.
        conv1_stride_s (int): Spatial stride of the first conv layer.
            Defaults to 2.
        conv1_stride_t (int): Temporal stride of the first conv layer.
            Defaults to 1.
        pool1_stride_s (int): Spatial stride of the first pooling layer.
            Defaults to 2.
        pool1_stride_t (int): Temporal stride of the first pooling layer.
            Defaults to 1.
        with_pool1 (bool): Whether to use the stem max pooling layer.
            Defaults to True.
        with_pool2 (bool): Whether to use pool2. Defaults to True.
        style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
            stride-two layer is the 3x3 conv layer, otherwise the stride-two
            layer is the first 1x1 conv layer. Defaults to ``'pytorch'``.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters. Defaults to -1.
        inflate (Sequence[int]): Inflate Dims of each block.
            Defaults to ``(1, 1, 1, 1)``.
        inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
            kernel sizes and padding strides for conv1 and conv2 in each block.
            Defaults to ``3x1x1``.
        conv_cfg (dict): Config for conv layers.
            Required keys are ``type``. Defaults to ``dict(type='Conv3d')``.
        norm_cfg (dict): Config for norm layers.
            Required keys are ``type`` and ``requires_grad``.
            Defaults to ``dict(type='BN3d', requires_grad=True)``.
        act_cfg (dict): Config dict for activation layer.
            Defaults to ``dict(type='ReLU', inplace=True)``.
        norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze
            running stats (``mean`` and ``var``). Defaults to False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Defaults to False.
        non_local (Sequence[int]): Determine whether to apply non-local module
            in the corresponding block of each stages.
            Defaults to ``(0, 0, 0, 0)``.
        non_local_cfg (dict): Config for non-local module.
            Defaults to ``dict()``.
        zero_init_residual (bool):
            Whether to use zero initialization for residual block,
            Defaults to True.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    # depth -> (block class, per-stage block counts)
    arch_settings = {
        18: (BasicBlock3d, (2, 2, 2, 2)),
        34: (BasicBlock3d, (3, 4, 6, 3)),
        50: (Bottleneck3d, (3, 4, 6, 3)),
        101: (Bottleneck3d, (3, 4, 23, 3)),
        152: (Bottleneck3d, (3, 8, 36, 3))
    }

    def __init__(self,
                 depth: int = 50,
                 pretrained: Optional[str] = None,
                 stage_blocks: Optional[Tuple] = None,
                 pretrained2d: bool = True,
                 in_channels: int = 3,
                 num_stages: int = 4,
                 base_channels: int = 64,
                 out_indices: Sequence[int] = (3, ),
                 spatial_strides: Sequence[int] = (1, 2, 2, 2),
                 temporal_strides: Sequence[int] = (1, 1, 1, 1),
                 dilations: Sequence[int] = (1, 1, 1, 1),
                 conv1_kernel: Sequence[int] = (3, 7, 7),
                 conv1_stride_s: int = 2,
                 conv1_stride_t: int = 1,
                 pool1_stride_s: int = 2,
                 pool1_stride_t: int = 1,
                 with_pool1: bool = True,
                 with_pool2: bool = True,
                 style: str = 'pytorch',
                 frozen_stages: int = -1,
                 inflate: Sequence[int] = (1, 1, 1, 1),
                 inflate_style: str = '3x1x1',
                 conv_cfg: Dict = dict(type='Conv3d'),
                 norm_cfg: Dict = dict(type='BN3d', requires_grad=True),
                 act_cfg: Dict = dict(type='ReLU', inplace=True),
                 norm_eval: bool = False,
                 with_cp: bool = False,
                 non_local: Sequence[int] = (0, 0, 0, 0),
                 non_local_cfg: Dict = dict(),
                 zero_init_residual: bool = True,
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for resnet')
        self.depth = depth
        self.pretrained = pretrained
        self.pretrained2d = pretrained2d
        self.in_channels = in_channels
        self.base_channels = base_channels
        self.num_stages = num_stages
        assert 1 <= num_stages <= 4
        self.stage_blocks = stage_blocks
        self.out_indices = out_indices
        assert max(out_indices) < num_stages
        self.spatial_strides = spatial_strides
        self.temporal_strides = temporal_strides
        self.dilations = dilations
        assert len(spatial_strides) == len(temporal_strides) == len(
            dilations) == num_stages
        if self.stage_blocks is not None:
            assert len(self.stage_blocks) == num_stages

        self.conv1_kernel = conv1_kernel
        self.conv1_stride_s = conv1_stride_s
        self.conv1_stride_t = conv1_stride_t
        self.pool1_stride_s = pool1_stride_s
        self.pool1_stride_t = pool1_stride_t
        self.with_pool1 = with_pool1
        self.with_pool2 = with_pool2
        self.style = style
        self.frozen_stages = frozen_stages
        # broadcast scalar settings to one value per stage
        self.stage_inflations = _ntuple(num_stages)(inflate)
        self.non_local_stages = _ntuple(num_stages)(non_local)
        self.inflate_style = inflate_style
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp
        self.zero_init_residual = zero_init_residual

        self.block, stage_blocks = self.arch_settings[depth]

        if self.stage_blocks is None:
            self.stage_blocks = stage_blocks[:num_stages]

        self.inplanes = self.base_channels

        self.non_local_cfg = non_local_cfg

        self._make_stem_layer()

        self.res_layers = []
        # ``lateral_inplanes`` is set by SlowFast-style subclasses before
        # this __init__ runs; plain ResNet3d has no lateral connections
        lateral_inplanes = getattr(self, 'lateral_inplanes', [0, 0, 0, 0])

        for i, num_blocks in enumerate(self.stage_blocks):
            spatial_stride = spatial_strides[i]
            temporal_stride = temporal_strides[i]
            dilation = dilations[i]
            planes = self.base_channels * 2**i
            res_layer = self.make_res_layer(
                self.block,
                self.inplanes + lateral_inplanes[i],
                planes,
                num_blocks,
                spatial_stride=spatial_stride,
                temporal_stride=temporal_stride,
                dilation=dilation,
                style=self.style,
                norm_cfg=self.norm_cfg,
                conv_cfg=self.conv_cfg,
                act_cfg=self.act_cfg,
                non_local=self.non_local_stages[i],
                non_local_cfg=self.non_local_cfg,
                inflate=self.stage_inflations[i],
                inflate_style=self.inflate_style,
                with_cp=with_cp,
                **kwargs)
            self.inplanes = planes * self.block.expansion
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self.feat_dim = self.block.expansion * \
            self.base_channels * 2 ** (len(self.stage_blocks) - 1)

    @staticmethod
    def make_res_layer(block: nn.Module,
                       inplanes: int,
                       planes: int,
                       blocks: int,
                       spatial_stride: Union[int, Sequence[int]] = 1,
                       temporal_stride: Union[int, Sequence[int]] = 1,
                       dilation: int = 1,
                       style: str = 'pytorch',
                       inflate: Union[int, Sequence[int]] = 1,
                       inflate_style: str = '3x1x1',
                       non_local: Union[int, Sequence[int]] = 0,
                       non_local_cfg: Dict = dict(),
                       norm_cfg: Optional[Dict] = None,
                       act_cfg: Optional[Dict] = None,
                       conv_cfg: Optional[Dict] = None,
                       with_cp: bool = False,
                       **kwargs) -> nn.Module:
        """Build residual layer for ResNet3D.

        Args:
            block (nn.Module): Residual module to be built.
            inplanes (int): Number of channels for the input feature
                in each block.
            planes (int): Number of channels for the output feature
                in each block.
            blocks (int): Number of residual blocks.
            spatial_stride (int | Sequence[int]): Spatial strides in
                residual and conv layers. Defaults to 1.
            temporal_stride (int | Sequence[int]): Temporal strides in
                residual and conv layers. Defaults to 1.
            dilation (int): Spacing between kernel elements. Defaults to 1.
            style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
                stride-two layer is the 3x3 conv layer,otherwise the
                stride-two layer is the first 1x1 conv layer.
                Defaults to ``'pytorch'``.
            inflate (int | Sequence[int]): Determine whether to inflate
                for each block. Defaults to 1.
            inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines
                the kernel sizes and padding strides for conv1 and conv2
                in each block. Default: ``'3x1x1'``.
            non_local (int | Sequence[int]): Determine whether to apply
                non-local module in the corresponding block of each stages.
                Defaults to 0.
            non_local_cfg (dict): Config for non-local module.
                Defaults to ``dict()``.
            conv_cfg (dict, optional): Config for conv layers.
                Defaults to None.
            norm_cfg (dict, optional): Config for norm layers.
                Defaults to None.
            act_cfg (dict, optional): Config for activate layers.
                Defaults to None.
            with_cp (bool, optional): Use checkpoint or not. Using checkpoint
                will save some memory while slowing down the training speed.
                Defaults to False.

        Returns:
            nn.Module: A residual layer for the given config.
        """
        # expand scalar per-layer flags to one flag per block
        inflate = inflate if not isinstance(inflate, int) \
            else (inflate, ) * blocks
        non_local = non_local if not isinstance(non_local, int) \
            else (non_local, ) * blocks
        assert len(inflate) == blocks and len(non_local) == blocks
        downsample = None
        # a projection shortcut is needed when the residual branch changes
        # resolution or channel count
        if spatial_stride != 1 or inplanes != planes * block.expansion:
            downsample = ConvModule(
                inplanes,
                planes * block.expansion,
                kernel_size=1,
                stride=(temporal_stride, spatial_stride, spatial_stride),
                bias=False,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=None)

        layers = []
        # only the first block in a stage strides/downsamples
        layers.append(
            block(
                inplanes,
                planes,
                spatial_stride=spatial_stride,
                temporal_stride=temporal_stride,
                dilation=dilation,
                downsample=downsample,
                style=style,
                inflate=(inflate[0] == 1),
                inflate_style=inflate_style,
                non_local=(non_local[0] == 1),
                non_local_cfg=non_local_cfg,
                norm_cfg=norm_cfg,
                conv_cfg=conv_cfg,
                act_cfg=act_cfg,
                with_cp=with_cp,
                **kwargs))
        inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    inplanes,
                    planes,
                    spatial_stride=1,
                    temporal_stride=1,
                    dilation=dilation,
                    style=style,
                    inflate=(inflate[i] == 1),
                    inflate_style=inflate_style,
                    non_local=(non_local[i] == 1),
                    non_local_cfg=non_local_cfg,
                    norm_cfg=norm_cfg,
                    conv_cfg=conv_cfg,
                    act_cfg=act_cfg,
                    with_cp=with_cp,
                    **kwargs))

        return Sequential(*layers)
    @staticmethod
    def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict,
                             module_name_2d: str,
                             inflated_param_names: List[str]) -> None:
        """Inflate a conv module from 2d to 3d.

        The 2d kernel is repeated along the new temporal axis and divided by
        the temporal kernel size so the inflated filter produces the same
        response on a temporally constant input.

        Args:
            conv3d (nn.Module): The destination conv3d module.
            state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
            module_name_2d (str): The name of corresponding conv module in the
                2d model.
            inflated_param_names (list[str]): List of parameters that have been
                inflated.
        """
        weight_2d_name = module_name_2d + '.weight'

        conv2d_weight = state_dict_2d[weight_2d_name]
        kernel_t = conv3d.weight.data.shape[2]

        new_weight = conv2d_weight.data.unsqueeze(2).expand_as(
            conv3d.weight) / kernel_t
        conv3d.weight.data.copy_(new_weight)
        inflated_param_names.append(weight_2d_name)

        if getattr(conv3d, 'bias') is not None:
            bias_2d_name = module_name_2d + '.bias'
            conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])
            inflated_param_names.append(bias_2d_name)

    @staticmethod
    def _inflate_bn_params(bn3d: nn.Module, state_dict_2d: OrderedDict,
                           module_name_2d: str,
                           inflated_param_names: List[str]) -> None:
        """Inflate a norm module from 2d to 3d.

        BN parameters are shape-identical between 2d and 3d, so they are
        copied directly when shapes match.

        Args:
            bn3d (nn.Module): The destination bn3d module.
            state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
            module_name_2d (str): The name of corresponding bn module in the
                2d model.
            inflated_param_names (list[str]): List of parameters that have been
                inflated.
        """
        for param_name, param in bn3d.named_parameters():
            param_2d_name = f'{module_name_2d}.{param_name}'
            param_2d = state_dict_2d[param_2d_name]
            if param.data.shape != param_2d.shape:
                warnings.warn(f'The parameter of {module_name_2d} is not'
                              'loaded due to incompatible shapes. ')
                return

            param.data.copy_(param_2d)
            inflated_param_names.append(param_2d_name)

        for param_name, param in bn3d.named_buffers():
            param_2d_name = f'{module_name_2d}.{param_name}'
            # some buffers like num_batches_tracked may not exist in old
            # checkpoints
            if param_2d_name in state_dict_2d:
                param_2d = state_dict_2d[param_2d_name]
                param.data.copy_(param_2d)
                inflated_param_names.append(param_2d_name)

    # NOTE: deliberately a staticmethod taking ``self`` explicitly — it is
    # shared by assignment with ``ResNet3dLayer`` (see that class's __init__),
    # which binds it without the normal descriptor protocol.
    @staticmethod
    def _inflate_weights(self, logger: MMLogger) -> None:
        """Inflate the resnet2d parameters to resnet3d.

        The differences between resnet3d and resnet2d mainly lie in an extra
        axis of conv kernel. To utilize the pretrained parameters in 2d model,
        the weight of conv2d models should be inflated to fit in the shapes of
        the 3d counterpart.

        Args:
            logger (MMLogger): The logger used to print
                debugging information.
        """

        state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu')
        if 'state_dict' in state_dict_r2d:
            state_dict_r2d = state_dict_r2d['state_dict']

        inflated_param_names = []
        for name, module in self.named_modules():
            if isinstance(module, ConvModule):
                # we use a ConvModule to wrap conv+bn+relu layers, thus the
                # name mapping is needed
                if 'downsample' in name:
                    # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0
                    original_conv_name = name + '.0'
                    # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1
                    original_bn_name = name + '.1'
                else:
                    # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n}
                    original_conv_name = name
                    # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n}
                    original_bn_name = name.replace('conv', 'bn')
                if original_conv_name + '.weight' not in state_dict_r2d:
                    logger.warning(f'Module not exist in the state_dict_r2d'
                                   f': {original_conv_name}')
                else:
                    shape_2d = state_dict_r2d[original_conv_name +
                                              '.weight'].shape
                    shape_3d = module.conv.weight.data.shape
                    # a 3d kernel inflated from 2d must match on
                    # (out_c, in_c) and the two spatial dims
                    if shape_2d != shape_3d[:2] + shape_3d[3:]:
                        logger.warning(f'Weight shape mismatch for '
                                       f': {original_conv_name} : '
                                       f'3d weight shape: {shape_3d}; '
                                       f'2d weight shape: {shape_2d}. ')
                    else:
                        self._inflate_conv_params(module.conv, state_dict_r2d,
                                                  original_conv_name,
                                                  inflated_param_names)

                if original_bn_name + '.weight' not in state_dict_r2d:
                    logger.warning(f'Module not exist in the state_dict_r2d'
                                   f': {original_bn_name}')
                else:
                    self._inflate_bn_params(module.bn, state_dict_r2d,
                                            original_bn_name,
                                            inflated_param_names)

        # check if any parameters in the 2d checkpoint are not loaded
        remaining_names = set(
            state_dict_r2d.keys()) - set(inflated_param_names)
        if remaining_names:
            logger.info(f'These parameters in the 2d checkpoint are not loaded'
                        f': {remaining_names}')

    def inflate_weights(self, logger: MMLogger) -> None:
        """Inflate weights."""
        self._inflate_weights(self, logger)

    def _make_stem_layer(self) -> None:
        """Construct the stem layers consists of a conv+norm+act module and a
        pooling layer."""
        self.conv1 = ConvModule(
            self.in_channels,
            self.base_channels,
            kernel_size=self.conv1_kernel,
            stride=(self.conv1_stride_t, self.conv1_stride_s,
                    self.conv1_stride_s),
            padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]),
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

        self.maxpool = nn.MaxPool3d(
            kernel_size=(1, 3, 3),
            stride=(self.pool1_stride_t, self.pool1_stride_s,
                    self.pool1_stride_s),
            padding=(0, 1, 1))

        # temporal-only pooling applied after the first res stage in forward
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1))

    def _freeze_stages(self) -> None:
        """Prevent all the parameters from being optimized before
        ``self.frozen_stages``."""
        if self.frozen_stages >= 0:
            self.conv1.eval()
            for param in self.conv1.parameters():
                param.requires_grad = False

        for i in range(1, self.frozen_stages + 1):
            m = getattr(self, f'layer{i}')
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    # NOTE: same staticmethod-with-self pattern as ``_inflate_weights`` —
    # reused by ``ResNet3dLayer`` via direct assignment.
    @staticmethod
    def _init_weights(self, pretrained: Optional[str] = None) -> None:
        """Initiate the parameters either from existing checkpoint or from
        scratch.

        Args:
            pretrained (str | None): The path of the pretrained weight. Will
                override the original `pretrained` if set. The arg is added to
                be compatible with mmdet. Defaults to None.
        """
        if pretrained:
            self.pretrained = pretrained
        if isinstance(self.pretrained, str):
            logger = MMLogger.get_current_instance()
            logger.info(f'load model from: {self.pretrained}')

            if self.pretrained2d:
                # Inflate 2D model into 3D model.
                self.inflate_weights(logger)
            else:
                # Directly load 3D model.
                load_checkpoint(
                    self, self.pretrained, strict=False, logger=logger)

        elif self.pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv3d):
                    kaiming_init(m)
                elif isinstance(m, _BatchNorm):
                    constant_init(m, 1)

            if self.zero_init_residual:
                # zero-init the last BN of each residual branch so blocks
                # start as identity mappings
                for m in self.modules():
                    if isinstance(m, Bottleneck3d):
                        constant_init(m.conv3.bn, 0)
                    elif isinstance(m, BasicBlock3d):
                        constant_init(m.conv2.bn, 0)
        else:
            raise TypeError('pretrained must be a str or None')

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Initialize weights."""
        self._init_weights(self, pretrained)
+ """ + x = self.conv1(x) + if self.with_pool1: + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0 and self.with_pool2: + x = self.pool2(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class ResNet3dLayer(BaseModule): + """ResNet 3d Layer. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + pretrained (str, optional): Name of pretrained model. Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + stage (int): The index of Resnet stage. Defaults to 3. + base_channels (int): Channel num of stem output features. + Defaults to 64. + spatial_stride (int): The 1st res block's spatial stride. + Defaults to 2. + temporal_stride (int): The 1st res block's temporal stride. + Defaults to 1. + dilation (int): The dilation. Defaults to 1. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. + all_frozen (bool): Frozen all modules in the layer. Defaults to False. + inflate (int): Inflate dims of each block. Defaults to 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Defaults to ``'3x1x1'``. + conv_cfg (dict): Config for conv layers. + Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. + Required keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN3d', requires_grad=True)``. 
+ act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (``mean`` and ``var``). Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + depth: int, + pretrained: Optional[str] = None, + pretrained2d: bool = True, + stage: int = 3, + base_channels: int = 64, + spatial_stride: int = 2, + temporal_stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + all_frozen: bool = False, + inflate: int = 1, + inflate_style: str = '3x1x1', + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + with_cp: bool = False, + zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.arch_settings = ResNet3d.arch_settings + assert depth in self.arch_settings + + self.make_res_layer = ResNet3d.make_res_layer + self._inflate_conv_params = ResNet3d._inflate_conv_params + self._inflate_bn_params = ResNet3d._inflate_bn_params + self._inflate_weights = ResNet3d._inflate_weights + self._init_weights = ResNet3d._init_weights + + self.depth = depth + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.stage = stage + # stage index is 0 based + assert 0 <= stage <= 3 + self.base_channels = base_channels + + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + + self.style = style + self.all_frozen = all_frozen + + self.stage_inflation = 
inflate + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + block, stage_blocks = self.arch_settings[depth] + stage_block = stage_blocks[stage] + planes = 64 * 2**stage + inplanes = 64 * 2**(stage - 1) * block.expansion + + res_layer = self.make_res_layer( + block, + inplanes, + planes, + stage_block, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + style=self.style, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + inflate=self.stage_inflation, + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + + self.layer_name = f'layer{stage + 1}' + self.add_module(self.layer_name, res_layer) + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate weights.""" + self._inflate_weights(self, logger) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.all_frozen: + layer = getattr(self, self.layer_name) + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initialize weights.""" + self._init_weights(self, pretrained) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the residual layer. 
+ """ + res_layer = getattr(self, self.layer_name) + out = res_layer(x) + return out + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d67f6ceadd413c1908b433a610cef2f988ccd49 --- /dev/null +++ b/mmaction/models/backbones/resnet3d_csn.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d import Bottleneck3d, ResNet3d + + +class CSNBottleneck3d(Bottleneck3d): + """Channel-Separated Bottleneck Block. + + This module is proposed in + "Video Classification with Channel-Separated Convolutional Networks" + Link: https://arxiv.org/pdf/1711.11248.pdf + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. + If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ir'. + args (position arguments): Position arguments for Bottleneck. + kwargs (dict, optional): Keyword arguments for Bottleneck. 
+ """ + + def __init__(self, + inplanes, + planes, + *args, + bottleneck_mode='ir', + **kwargs): + super(CSNBottleneck3d, self).__init__(inplanes, planes, *args, + **kwargs) + self.bottleneck_mode = bottleneck_mode + conv2 = [] + if self.bottleneck_mode == 'ip': + conv2.append( + ConvModule( + planes, + planes, + 1, + stride=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None)) + conv2_kernel_size = self.conv2.conv.kernel_size + conv2_stride = self.conv2.conv.stride + conv2_padding = self.conv2.conv.padding + conv2_dilation = self.conv2.conv.dilation + conv2_bias = bool(self.conv2.conv.bias) + self.conv2 = ConvModule( + planes, + planes, + conv2_kernel_size, + stride=conv2_stride, + padding=conv2_padding, + dilation=conv2_dilation, + bias=conv2_bias, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + groups=planes) + conv2.append(self.conv2) + self.conv2 = nn.Sequential(*conv2) + + +@MODELS.register_module() +class ResNet3dCSN(ResNet3d): + """ResNet backbone for CSN. + + Args: + depth (int): Depth of ResNetCSN, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + temporal_strides (tuple[int]): + Temporal strides of residual blocks of each stage. + Default: (1, 2, 2, 2). + conv1_kernel (tuple[int]): Kernel size of the first conv layer. + Default: (3, 7, 7). + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + norm_cfg (dict): Config for norm layers. required keys are `type` and + `requires_grad`. + Default: dict(type='BN3d', requires_grad=True, eps=1e-3). + inflate_style (str): `3x1x1` or `3x3x3`. which determines the kernel + sizes and padding strides for conv1 and conv2 in each block. + Default: '3x3x3'. + bottleneck_mode (str): Determine which ways to factorize a 3D + bottleneck block using channel-separated convolutional networks. 
+ If set to 'ip', it will replace the 3x3x3 conv2 layer with a + 1x1x1 traditional convolution and a 3x3x3 depthwise + convolution, i.e., Interaction-preserved channel-separated + bottleneck block. + If set to 'ir', it will replace the 3x3x3 conv2 layer with a + 3x3x3 depthwise convolution, which is derived from preserved + bottleneck block by removing the extra 1x1x1 convolution, + i.e., Interaction-reduced channel-separated bottleneck block. + Default: 'ip'. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + def __init__(self, + depth, + pretrained, + temporal_strides=(1, 2, 2, 2), + conv1_kernel=(3, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_cfg=dict(type='BN3d', requires_grad=True, eps=1e-3), + inflate_style='3x3x3', + bottleneck_mode='ir', + bn_frozen=False, + **kwargs): + self.arch_settings = { + # 18: (BasicBlock3d, (2, 2, 2, 2)), + # 34: (BasicBlock3d, (3, 4, 6, 3)), + 50: (CSNBottleneck3d, (3, 4, 6, 3)), + 101: (CSNBottleneck3d, (3, 4, 23, 3)), + 152: (CSNBottleneck3d, (3, 8, 36, 3)) + } + self.bn_frozen = bn_frozen + if bottleneck_mode not in ['ip', 'ir']: + raise ValueError(f'Bottleneck mode must be "ip" or "ir",' + f'but got {bottleneck_mode}.') + super(ResNet3dCSN, self).__init__( + depth, + pretrained, + temporal_strides=temporal_strides, + conv1_kernel=conv1_kernel, + conv1_stride_t=conv1_stride_t, + pool1_stride_t=pool1_stride_t, + norm_cfg=norm_cfg, + inflate_style=inflate_style, + bottleneck_mode=bottleneck_mode, + **kwargs) + + def train(self, mode=True): + """Set the optimization status when training.""" + super(ResNet3d, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + if self.bn_frozen: + for param in m.parameters(): + param.requires_grad = False diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py new file mode 100644 index 
0000000000000000000000000000000000000000..3d069625eed4473fa33117d536bab13e90bd0995 --- /dev/null +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -0,0 +1,510 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import kaiming_init +from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint + +from mmaction.registry import MODELS +from .resnet3d import ResNet3d + + +class DeConvModule(BaseModule): + """A deconv module that bundles deconv/norm/activation layers. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int | tuple[int]): Size of the convolving kernel. + stride (int | tuple[int]): Stride of the convolution. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. + bias (bool): Whether to add a learnable bias to the output. + Defaults to False. + with_bn (bool): Whether to add a BN layer. Defaults to True. + with_relu (bool): Whether to add a ReLU layer. Defaults to True. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: Union[int, Tuple[int]] = (1, 1, 1), + padding: Union[int, Tuple[int]] = 0, + bias: bool = False, + with_bn: bool = True, + with_relu: bool = True) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias = bias + self.with_bn = with_bn + self.with_relu = with_relu + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias) + self.bn = nn.BatchNorm3d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + # x should be a 5-d tensor + assert len(x.shape) == 5 + N, C, T, H, W = x.shape + out_shape = (N, self.out_channels, self.stride[0] * T, + self.stride[1] * H, self.stride[2] * W) + x = self.conv(x, output_size=out_shape) + if self.with_bn: + x = self.bn(x) + if self.with_relu: + x = self.relu(x) + return x + + +class ResNet3dPathway(ResNet3d): + """A pathway of Slowfast based on ResNet3d. + + Args: + lateral (bool): Determines whether to enable the lateral connection + from another pathway. Defaults to False. + lateral_inv (bool): Whether to use deconv to upscale the time + dimension of features from another pathway. Defaults to False. + lateral_norm (bool): Determines whether to enable the lateral norm + in lateral layers. Defaults to False. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + ``alpha`` in the paper. Defaults to 8. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to ``beta`` in the paper. + Defaults to 8. + fusion_kernel (int): The kernel size of lateral fusion. + Defaults to 5. 
+ lateral_infl (int): The ratio of the inflated channels. + Defaults to 2. + lateral_activate (list[int]): Flags for activating the lateral + connection. Defaults to ``[1, 1, 1, 1]``. + """ + + def __init__(self, + lateral: bool = False, + lateral_inv: bool = False, + lateral_norm: bool = False, + speed_ratio: int = 8, + channel_ratio: int = 8, + fusion_kernel: int = 5, + lateral_infl: int = 2, + lateral_activate: List[int] = [1, 1, 1, 1], + **kwargs) -> None: + self.lateral = lateral + self.lateral_inv = lateral_inv + self.lateral_norm = lateral_norm + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + self.fusion_kernel = fusion_kernel + self.lateral_infl = lateral_infl + self.lateral_activate = lateral_activate + self._calculate_lateral_inplanes(kwargs) + + super().__init__(**kwargs) + self.inplanes = self.base_channels + if self.lateral and self.lateral_activate[0] == 1: + if self.lateral_inv: + self.conv1_lateral = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + with_bn=True, + with_relu=True) + else: + self.conv1_lateral = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) + + self.lateral_connections = [] + for i in range(len(self.stage_blocks)): + planes = self.base_channels * 2**i + self.inplanes = planes * self.block.expansion + + if lateral and i != self.num_stages - 1 \ + and self.lateral_activate[i + 1]: + # no lateral connection needed in final stage + lateral_name = f'layer{(i + 1)}_lateral' + if self.lateral_inv: + conv_module = DeConvModule( 
+ self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + with_bn=True, + with_relu=True) + else: + conv_module = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) + setattr(self, lateral_name, conv_module) + self.lateral_connections.append(lateral_name) + + def _calculate_lateral_inplanes(self, kwargs): + """Calculate inplanes for lateral connection.""" + depth = kwargs.get('depth', 50) + expansion = 1 if depth < 50 else 4 + base_channels = kwargs.get('base_channels', 64) + lateral_inplanes = [] + for i in range(kwargs.get('num_stages', 4)): + if expansion % 2 == 0: + planes = base_channels * (2 ** i) * \ + ((expansion // 2) ** (i > 0)) + else: + planes = base_channels * (2**i) // (2**(i > 0)) + if self.lateral and self.lateral_activate[i]: + if self.lateral_inv: + lateral_inplane = planes * \ + self.channel_ratio // self.lateral_infl + else: + lateral_inplane = planes * \ + self.lateral_infl // self.channel_ratio + else: + lateral_inplane = 0 + lateral_inplanes.append(lateral_inplane) + self.lateral_inplanes = lateral_inplanes + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate the resnet2d parameters to resnet3d pathway. + + The differences between resnet3d and resnet2d mainly lie in an extra + axis of conv kernel. To utilize the pretrained parameters in 2d model, + the weight of conv2d models should be inflated to fit in the shapes of + the 3d counterpart. For pathway the ``lateral_connection`` part should + not be inflated from 2d weights. 
+ + Args: + logger (MMLogger): The logger used to print + debugging information. + """ + + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_r2d: + state_dict_r2d = state_dict_r2d['state_dict'] + + inflated_param_names = [] + for name, module in self.named_modules(): + if 'lateral' in name: + continue + if isinstance(module, ConvModule): + # we use a ConvModule to wrap conv+bn+relu layers, thus the + # name mapping is needed + if 'downsample' in name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + original_conv_name = name + '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + original_bn_name = name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + original_conv_name = name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + original_bn_name = name.replace('conv', 'bn') + if original_conv_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_conv_name}') + else: + self._inflate_conv_params(module.conv, state_dict_r2d, + original_conv_name, + inflated_param_names) + if original_bn_name + '.weight' not in state_dict_r2d: + logger.warning(f'Module not exist in the state_dict_r2d' + f': {original_bn_name}') + else: + self._inflate_bn_params(module.bn, state_dict_r2d, + original_bn_name, + inflated_param_names) + + # check if any parameters in the 2d checkpoint are not loaded + remaining_names = set( + state_dict_r2d.keys()) - set(inflated_param_names) + if remaining_names: + logger.info(f'These parameters in the 2d checkpoint are not loaded' + f': {remaining_names}') + + def _inflate_conv_params(self, conv3d: nn.Module, + state_dict_2d: OrderedDict, module_name_2d: str, + inflated_param_names: List[str]) -> None: + """Inflate a conv module from 2d to 3d. + + The differences of conv modules betweene 2d and 3d in Pathway + mainly lie in the inplanes due to lateral connections. 
To fit the + shapes of the lateral connection counterpart, it will expand + parameters by concatting conv2d parameters and extra zero paddings. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + conv2d_weight = state_dict_2d[weight_2d_name] + old_shape = conv2d_weight.shape + new_shape = conv3d.weight.data.shape + kernel_t = new_shape[2] + + if new_shape[1] != old_shape[1]: + if new_shape[1] < old_shape[1]: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. ') + return + # Inplanes may be different due to lateral connections + new_channels = new_shape[1] - old_shape[1] + pad_shape = old_shape + pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:] + # Expand parameters by concat extra channels + conv2d_weight = torch.cat( + (conv2d_weight, + torch.zeros(pad_shape).type_as(conv2d_weight).to( + conv2d_weight.device)), + dim=1) + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + `self.frozen_stages`.""" + if self.frozen_stages >= 0: + self.conv1.eval() + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if i != len(self.res_layers) and self.lateral: + # 
No fusion needed in the final stage + lateral_name = self.lateral_connections[i - 1] + conv_lateral = getattr(self, lateral_name) + conv_lateral.eval() + for param in conv_lateral.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if pretrained: + self.pretrained = pretrained + + # Override the init_weights of i3d + super().init_weights() + for module_name in self.lateral_connections: + layer = getattr(self, module_name) + for m in layer.modules(): + if isinstance(m, (nn.Conv3d, nn.Conv2d)): + kaiming_init(m) + + +pathway_cfg = { + 'resnet3d': ResNet3dPathway, + # TODO: BNInceptionPathway +} + + +def build_pathway(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build pathway. + + Args: + cfg (dict): cfg should contain: + - type (str): identify backbone type. + + Returns: + nn.Module: Created pathway. + """ + if not (isinstance(cfg, dict) and 'type' in cfg): + raise TypeError('cfg must be a dict containing the key "type"') + cfg_ = cfg.copy() + + pathway_type = cfg_.pop('type') + if pathway_type not in pathway_cfg: + raise KeyError(f'Unrecognized pathway type {pathway_type}') + + pathway_cls = pathway_cfg[pathway_type] + pathway = pathway_cls(*args, **kwargs, **cfg_) + + return pathway + + +@MODELS.register_module() +class ResNet3dSlowFast(BaseModule): + """Slowfast backbone. + + This module is proposed in `SlowFast Networks for Video Recognition + `_ + + Args: + pretrained (str): The file path to a pretrained model. + resample_rate (int): A large temporal stride ``resample_rate`` + on input frames. The actual resample rate is calculated by + multipling the ``interval`` in ``SampleFrames`` in the + pipeline with ``resample_rate``, equivalent to the :math:`\\tau` + in the paper, i.e. it processes only one out of + ``resample_rate * interval`` frames. Defaults to 8. 
+ speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 8. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 8. + slow_pathway (dict): Configuration of slow branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=True, + conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1, + inflate=(0, 0, 1, 1))``. + fast_pathway (dict): Configuration of fast branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=False, + base_channels=8, conv1_kernel=(5, 7, 7), conv1_stride_t=1, + pool1_stride_t=1)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + pretrained: Optional[str] = None, + resample_rate: int = 8, + speed_ratio: int = 8, + channel_ratio: int = 8, + slow_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1)), + fast_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.resample_rate = resample_rate + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if slow_pathway['lateral']: + slow_pathway['speed_ratio'] = speed_ratio + slow_pathway['channel_ratio'] = channel_ratio + + self.slow_path = build_pathway(slow_pathway) + self.fast_path = build_pathway(fast_pathway) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if pretrained: + 
self.pretrained = pretrained + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + # Directly load 3D model. + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.fast_path.init_weights() + self.slow_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input samples + extracted by the backbone. + """ + x_slow = nn.functional.interpolate( + x, + mode='nearest', + scale_factor=(1.0 / self.resample_rate, 1.0, 1.0)) + x_slow = self.slow_path.conv1(x_slow) + x_slow = self.slow_path.maxpool(x_slow) + + x_fast = nn.functional.interpolate( + x, + mode='nearest', + scale_factor=(1.0 / (self.resample_rate // self.speed_ratio), 1.0, + 1.0)) + x_fast = self.fast_path.conv1(x_fast) + x_fast = self.fast_path.maxpool(x_fast) + + if self.slow_path.lateral: + x_fast_lateral = self.slow_path.conv1_lateral(x_fast) + x_slow = torch.cat((x_slow, x_fast_lateral), dim=1) + + for i, layer_name in enumerate(self.slow_path.res_layers): + res_layer = getattr(self.slow_path, layer_name) + x_slow = res_layer(x_slow) + res_layer_fast = getattr(self.fast_path, layer_name) + x_fast = res_layer_fast(x_fast) + if (i != len(self.slow_path.res_layers) - 1 + and self.slow_path.lateral): + # No fusion needed in the final stage + lateral_name = self.slow_path.lateral_connections[i] + conv_lateral = getattr(self.slow_path, lateral_name) + x_fast_lateral = conv_lateral(x_fast) + x_slow = torch.cat((x_slow, x_fast_lateral), dim=1) + + out = (x_slow, x_fast) + + return out diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py new 
file mode 100644 index 0000000000000000000000000000000000000000..7fac766294879890135b549b69aebc21a9fb795c --- /dev/null +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class ResNet3dSlowOnly(ResNet3dPathway): + """SlowOnly backbone based on ResNet3dPathway. + + Args: + conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Defaults to ``(1, 7, 7)``. + conv1_stride_t (int): Temporal stride of the first conv layer. + Defaults to 1. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Defaults to 1. + inflate (Sequence[int]): Inflate dims of each block. + Defaults to ``(0, 0, 1, 1)``. + with_pool2 (bool): Whether to use pool2. Defaults to False. + """ + + def __init__(self, + conv1_kernel: Sequence[int] = (1, 7, 7), + conv1_stride_t: int = 1, + pool1_stride_t: int = 1, + inflate: Sequence[int] = (0, 0, 1, 1), + with_pool2: bool = False, + **kwargs) -> None: + super().__init__( + conv1_kernel=conv1_kernel, + conv1_stride_t=conv1_stride_t, + pool1_stride_t=pool1_stride_t, + inflate=inflate, + with_pool2=with_pool2, + **kwargs) + + assert not self.lateral diff --git a/mmaction/models/backbones/resnet_audio.py b/mmaction/models/backbones/resnet_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..c44b5d3950a5a250c19b9f8c817abea8a7bfcbba --- /dev/null +++ b/mmaction/models/backbones/resnet_audio.py @@ -0,0 +1,386 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm +from torch.nn.modules.utils import _ntuple + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType + + +class Bottleneck2dAudio(nn.Module): + """Bottleneck2D block for ResNet2D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + stride (int): Stride in the conv layer. Defaults to 2. + dilation (int): Spacing between kernel elements. Defaults to 1. + downsample (nn.Module, optional): Downsample layer. Defaults to None. + factorize (bool): Whether to factorize kernel. Defaults to True. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the trgaining speed. Defaults to False. 
+ """ + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 2, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + factorize: bool = True, + norm_cfg: ConfigType = None, + with_cp: bool = False) -> None: + super().__init__() + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.factorize = factorize + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + self.conv1_stride = 1 + self.conv2_stride = stride + + conv1_kernel_size = (1, 1) + conv1_padding = 0 + conv2_kernel_size = (3, 3) + conv2_padding = (dilation, dilation) + self.conv1 = ConvModule( + inplanes, + planes, + kernel_size=conv1_kernel_size, + padding=conv1_padding, + dilation=dilation, + norm_cfg=self.norm_cfg, + bias=False) + self.conv2 = ConvModule( + planes, + planes, + kernel_size=conv2_kernel_size, + stride=stride, + padding=conv2_padding, + dilation=dilation, + bias=False, + conv_cfg=dict(type='ConvAudio') if factorize else dict( + type='Conv'), + norm_cfg=None, + act_cfg=None) + self.conv3 = ConvModule( + 2 * planes if factorize else planes, + planes * self.expansion, + kernel_size=1, + bias=False, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNetAudio(nn.Module): + """ResNet 2d audio backbone. Reference: + + `_. 
+ + Args: + depth (int): Depth of resnet, from ``{50, 101, 152}``. + pretrained (str, optional): Name of pretrained model. Defaults to None. + in_channels (int): Channel num of input features. Defaults to 1. + base_channels (int): Channel num of stem output features. + Defaults to 32. + num_stages (int): Resnet stages. Defaults to 4. + strides (Sequence[int]): Strides of residual blocks of each stage. + Defaults to ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Defaults to ``(1, 1, 1, 1)``. + conv1_kernel (int): Kernel size of the first conv layer. Defaults to 9. + conv1_stride (Union[int, Tuple[int]]): Stride of the first conv layer. + Defaults to 1. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Defaults to -1. + factorize (Sequence[int]): factorize Dims of each block for audio. + Defaults to ``(1, 1, 0, 0)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + conv_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='Conv')``. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. required + keys are ``type`` and ``requires_grad``. + Defaults to ``dict(type='BN2d', requires_grad=True)``. + act_cfg (Union[dict, ConfigDict]): Config for activate layers. + Defaults to ``dict(type='ReLU', inplace=True)``. + zero_init_residual (bool): Whether to use zero initialization + for residual block. Defaults to True. 
+ """ + + arch_settings = { + # 18: (BasicBlock2dAudio, (2, 2, 2, 2)), + # 34: (BasicBlock2dAudio, (3, 4, 6, 3)), + 50: (Bottleneck2dAudio, (3, 4, 6, 3)), + 101: (Bottleneck2dAudio, (3, 4, 23, 3)), + 152: (Bottleneck2dAudio, (3, 8, 36, 3)) + } + + def __init__(self, + depth: int, + pretrained: str = None, + in_channels: int = 1, + num_stages: int = 4, + base_channels: int = 32, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + conv1_kernel: int = 9, + conv1_stride: int = 1, + frozen_stages: int = -1, + factorize: Sequence[int] = (1, 1, 0, 0), + norm_eval: bool = False, + with_cp: bool = False, + conv_cfg: ConfigType = dict(type='Conv'), + norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + zero_init_residual: bool = True) -> None: + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained = pretrained + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.dilations = dilations + self.conv1_kernel = conv1_kernel + self.conv1_stride = conv1_stride + self.frozen_stages = frozen_stages + self.stage_factorization = _ntuple(num_stages)(factorize) + self.norm_eval = norm_eval + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = self.base_channels + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + 
factorize=self.stage_factorization[i], + norm_cfg=self.norm_cfg, + with_cp=with_cp) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * self.base_channels * 2**( + len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + factorize: int = 1, + norm_cfg: Optional[ConfigType] = None, + with_cp: bool = False) -> nn.Module: + """Build residual layer for ResNetAudio. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. + planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + stride (int): Strides of residual blocks of each stage. + Defaults to 1. + dilation (int): Spacing between kernel elements. Defaults to 1. + factorize (Uninon[int, Sequence[int]]): Determine whether to + factorize for each block. Defaults to 1. + norm_cfg (Union[dict, ConfigDict], optional): Config for norm + layers. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Defaults to False. + + Returns: + nn.Module: A residual layer for the given config. 
+ """ + factorize = factorize if not isinstance( + factorize, int) else (factorize, ) * blocks + assert len(factorize) == blocks + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = ConvModule( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + factorize=(factorize[0] == 1), + norm_cfg=norm_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + dilation, + factorize=(factorize[i] == 1), + norm_cfg=norm_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + def _make_stem_layer(self) -> None: + """Construct the stem layers consists of a ``conv+norm+act`` module and + a pooling layer.""" + self.conv1 = ConvModule( + self.in_channels, + self.base_channels, + kernel_size=self.conv1_kernel, + stride=self.conv1_stride, + bias=False, + conv_cfg=dict(type='ConvAudio', op='sum'), + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1.bn.eval() + for m in [self.conv1.conv, self.conv1.bn]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, 
nn.Conv2d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck2dAudio): + constant_init(m.conv3.bn, 0) + + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input samples extracted + by the backbone. + """ + x = self.conv1(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + x = res_layer(x) + return x + + def train(self, mode: bool = True) -> None: + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/backbones/resnet_omni.py b/mmaction/models/backbones/resnet_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..77c5cfac93113a3b280a10ebcefb1a5fa15e8eed --- /dev/null +++ b/mmaction/models/backbones/resnet_omni.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule +from mmengine.runner import CheckpointLoader + +from mmaction.registry import MODELS +from mmaction.utils import OptConfigType + + +def batch_norm(inputs: torch.Tensor, + module: nn.modules.batchnorm, + training: Optional[bool] = None) -> torch.Tensor: + """Applies Batch Normalization for each channel across a batch of data + using params from the given batch normalization module. + + Args: + inputs (Tensor): The input data. + module (nn.modules.batchnorm): a batch normalization module. Will use + params from this batch normalization module to do the operation. 
class BottleNeck(BaseModule):
    """Building block for Omni-ResNet.

    Args:
        inplanes (int): Number of channels for the input in first conv layer.
        planes (int): Number of channels for the input in second conv layer.
        temporal_kernel (int): Temporal kernel in the conv layer. Should be
            either 1 or 3. Defaults to 1.
        spatial_stride (int): Spatial stride in the conv layer. Defaults to 1.
        init_cfg (dict or ConfigDict, optional): The Config for initialization.
            Defaults to None.
    """

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 temporal_kernel: int = 3,
                 spatial_stride: int = 1,
                 init_cfg: OptConfigType = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)
        assert temporal_kernel in [1, 3]

        # Temporal-only conv: (t, 1, 1) kernel keeps spatial size.
        self.conv1 = nn.Conv3d(
            inplanes,
            planes,
            kernel_size=(temporal_kernel, 1, 1),
            padding=(temporal_kernel // 2, 0, 0),
            bias=False)
        # Spatial-only conv: (1, 3, 3) kernel carries the stride.
        self.conv2 = nn.Conv3d(
            planes,
            planes,
            stride=(1, spatial_stride, spatial_stride),
            kernel_size=(1, 3, 3),
            padding=(0, 1, 1),
            bias=False)
        # Pointwise expansion to 4x channels.
        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)

        self.bn1 = nn.BatchNorm3d(planes, momentum=0.01)
        self.bn2 = nn.BatchNorm3d(planes, momentum=0.01)
        self.bn3 = nn.BatchNorm3d(planes * 4, momentum=0.01)

        # A projection shortcut exists only when channels or resolution
        # change; otherwise ``downsample`` is simply absent and the plain
        # identity is added in ``forward``.
        if inplanes != planes * 4 or spatial_stride != 1:
            self.downsample = nn.Sequential(
                nn.Conv3d(
                    inplanes,
                    planes * 4,
                    kernel_size=1,
                    stride=(1, spatial_stride, spatial_stride),
                    bias=False),
                nn.BatchNorm3d(planes * 4, momentum=0.01))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Accept both 3D (BCTHW for videos) and 2D (BCHW for images) tensors.
        """
        # Dispatch 4-D image batches to the weight-sharing 2D path.
        if x.ndim == 4:
            return self.forward_2d(x)

        out = self.bn1(self.conv1(x)).relu_()
        out = self.bn2(self.conv2(out)).relu_()
        out = self.bn3(self.conv3(out))

        if hasattr(self, 'downsample'):
            x = self.downsample(x)

        return out.add_(x).relu_()

    def forward_2d(self, x: torch.Tensor) -> torch.Tensor:
        """Forward call for 2D tensors, reusing the 3D weights."""
        # conv1 is temporal: collapse its temporal taps by summation.
        out = F.conv2d(x, self.conv1.weight.sum(2))
        out = batch_norm(out, self.bn1).relu_()

        # conv2/conv3 have a singleton temporal dim: squeeze it away.
        out = F.conv2d(
            out,
            self.conv2.weight.squeeze(2),
            stride=self.conv2.stride[-1],
            padding=1)
        out = batch_norm(out, self.bn2).relu_()

        out = F.conv2d(out, self.conv3.weight.squeeze(2))
        out = batch_norm(out, self.bn3)

        if hasattr(self, 'downsample'):
            x = F.conv2d(
                x,
                self.downsample[0].weight.squeeze(2),
                stride=self.downsample[0].stride[-1])
            x = batch_norm(x, self.downsample[1])

        return out.add_(x).relu_()


@MODELS.register_module()
class OmniResNet(BaseModel):
    """Omni-ResNet that accepts both image and video inputs.

    Args:
        layers (List[int]): number of layers in each residual stages. Defaults
            to [3, 4, 6, 3].
        pretrain_2d (str, optional): path to the 2D pretraining checkpoints.
            Defaults to None.
        init_cfg (dict or ConfigDict, optional): The Config for initialization.
            Defaults to None.
    """

    # NOTE: the mutable default ``layers`` is only read, never mutated, so
    # the shared-default pitfall does not apply here.
    def __init__(self,
                 layers: List[int] = [3, 4, 6, 3],
                 pretrain_2d: Optional[str] = None,
                 init_cfg: OptConfigType = None) -> None:
        super().__init__(init_cfg=init_cfg)

        self.inplanes = 64
        # Stem: spatial 7x7 conv, no temporal extent.
        self.conv1 = nn.Conv3d(
            3,
            self.inplanes,
            kernel_size=(1, 7, 7),
            stride=(1, 2, 2),
            padding=(0, 3, 3),
            bias=False)
        self.bn1 = nn.BatchNorm3d(self.inplanes, momentum=0.01)

        self.pool3d = nn.MaxPool3d((1, 3, 3), (1, 2, 2), (0, 1, 1))
        self.pool2d = nn.MaxPool2d(3, 2, 1)

        # Early stages stay 2D (temporal kernel 1); late stages go temporal.
        self.temporal_kernel = 1
        self.layer1 = self._make_layer(64, layers[0])
        self.layer2 = self._make_layer(128, layers[1], stride=2)
        self.temporal_kernel = 3
        self.layer3 = self._make_layer(256, layers[2], stride=2)
        self.layer4 = self._make_layer(512, layers[3], stride=2)

        if pretrain_2d is not None:
            self.init_from_2d(pretrain_2d)

    def _make_layer(self,
                    planes: int,
                    num_blocks: int,
                    stride: int = 1) -> nn.Module:
        """Stack ``num_blocks`` bottlenecks; only the first one strides."""
        stage = [
            BottleNeck(
                self.inplanes,
                planes,
                spatial_stride=stride,
                temporal_kernel=self.temporal_kernel)
        ]
        self.inplanes = planes * 4
        stage.extend(
            BottleNeck(
                self.inplanes, planes, temporal_kernel=self.temporal_kernel)
            for _ in range(1, num_blocks))
        return nn.Sequential(*stage)

    def init_from_2d(self, pretrain: str) -> None:
        """Inflate a 2D checkpoint into the 3D weights.

        4-D conv kernels are repeated along the new temporal axis and scaled
        by 1/t so the inflated filter initially responds like the 2D one.
        """
        param2d = CheckpointLoader.load_checkpoint(
            pretrain, map_location='cpu')
        param3d = self.state_dict()
        for key in param3d:
            if key in param2d:
                weight = param2d[key]
                if weight.ndim == 4:
                    t = param3d[key].shape[2]
                    weight = weight.unsqueeze(2).expand(-1, -1, t, -1, -1)
                    weight = weight / t
                param3d[key] = weight
        self.load_state_dict(param3d)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Accept both 3D (BCTHW for videos) and 2D (BCHW for images) tensors.
        """
        if x.ndim == 4:
            return self.forward_2d(x)

        x = self.conv1(x)
        x = self.bn1(x).relu_()
        x = self.pool3d(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x

    def forward_2d(self, x: torch.Tensor) -> torch.Tensor:
        """Forward call for 2D tensors, reusing the 3D stem weights."""
        x = F.conv2d(
            x,
            self.conv1.weight.squeeze(2),
            stride=self.conv1.stride[-1],
            padding=self.conv1.padding[-1])
        x = batch_norm(x, self.bn1).relu_()
        x = self.pool2d(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x


# ---- mmaction/models/backbones/resnet_tin.py ----
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmaction.registry import MODELS
from .resnet_tsm import ResNetTSM


def linear_sampler(data, offset):
    """Differentiable Temporal-wise Frame Sampling, which is essentially a
    linear interpolation process.

    It gets the feature map which has been split into several groups
    and shift them by different offsets according to their groups.
    Then compute the weighted sum along with the temporal dimension.

    Args:
        data (torch.Tensor): Split data for certain group in shape
            [N, num_segments, C, H, W].
        offset (torch.Tensor): Data offsets for this group data in shape
            [N, num_segments].
    """
    n, t, c, h, w = data.shape

    # Integer neighbours of the fractional offset.
    offset0 = torch.floor(offset).int()
    offset1 = offset0 + 1

    # Flatten the spatial dims: tin_shift expects [N, T, C, H*W].
    data = data.view(n, t, c, h * w).contiguous()

    try:
        from mmcv.ops import tin_shift
    except (ImportError, ModuleNotFoundError):
        raise ImportError('Failed to import `tin_shift` from `mmcv.ops`. You '
                          'will be unable to use TIN. ')

    data0 = tin_shift(data, offset0)
    data1 = tin_shift(data, offset1)

    # Linear-interpolation weights for the two integer shifts.
    weight0 = 1 - (offset - offset0.float())
    weight1 = 1 - weight0

    # [N, num_segments] -> [N, num_segments, C // num_segments] -> [N, C]
    group_size = offset.shape[1]
    weight0 = weight0[:, :, None].repeat(1, 1, c // group_size)
    weight0 = weight0.view(weight0.size(0), -1)
    weight1 = weight1[:, :, None].repeat(1, 1, c // group_size)
    weight1 = weight1.view(weight1.size(0), -1)

    # [N, C] -> [N, 1, C, 1] for broadcasting over T and H*W.
    weight0 = weight0[:, None, :, None]
    weight1 = weight1[:, None, :, None]

    output = weight0 * data0 + weight1 * data1
    return output.view(n, t, c, h, w)
class CombineNet(nn.Module):
    """Combine Net.

    It combines Temporal interlace module with some part of ResNet layer.

    Args:
        net1 (nn.module): Temporal interlace module.
        net2 (nn.module): Some part of ResNet layer.
    """

    def __init__(self, net1, net2):
        super().__init__()
        self.net1 = net1
        self.net2 = net2

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        # input shape: [num_batches * num_segments, C, H, W]
        # output x shape: [num_batches * num_segments, C, H, W]
        x = self.net1(x)
        # [num_batches * num_segments, C, H, W]
        x = self.net2(x)
        return x


class WeightNet(nn.Module):
    """WeightNet in Temporal interlace module.

    The WeightNet consists of two parts: one convolution layer
    and a sigmoid function. Following the convolution layer, the sigmoid
    function and rescale module can scale our output to the range (0, 2).
    Here we set the initial bias of the convolution layer to 0, and the
    final initial output will be 1.0.

    Args:
        in_channels (int): Channel num of input features.
        groups (int): Number of groups for fc layer outputs.
    """

    def __init__(self, in_channels, groups):
        super().__init__()
        self.sigmoid = nn.Sigmoid()
        self.groups = groups

        self.conv = nn.Conv1d(in_channels, groups, 3, padding=1)

        self.init_weights()

    def init_weights(self):
        """Initiate the parameters either from existing checkpoint or from
        scratch."""
        # zero bias => sigmoid(0) = 0.5 => initial weight 2 * 0.5 = 1.0,
        # i.e. the module starts as a no-op scaling.
        self.conv.bias.data[...] = 0

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        # calculate weight
        # [N, C, T]
        n, _, t = x.shape
        # [N, groups, T]
        x = self.conv(x)
        x = x.view(n, self.groups, t)
        # [N, T, groups]
        x = x.permute(0, 2, 1)

        # scale the output to range (0, 2)
        x = 2 * self.sigmoid(x)
        # [N, T, groups]
        return x


class OffsetNet(nn.Module):
    """OffsetNet in Temporal interlace module.

    The OffsetNet consists of one convolution layer and two fc layers
    with a relu activation following with a sigmoid function. Following
    the convolution layer, two fc layers and relu are applied to the output.
    Then, apply the sigmoid function with a multiply factor and a minus 0.5
    to transform the output to (-4, 4).

    Args:
        in_channels (int): Channel num of input features.
        groups (int): Number of groups for fc layer outputs.
        num_segments (int): Number of frame segments.
    """

    def __init__(self, in_channels, groups, num_segments):
        super().__init__()
        self.sigmoid = nn.Sigmoid()
        # hard code ``kernel_size`` and ``padding`` according to original repo.
        kernel_size = 3
        padding = 1

        self.conv = nn.Conv1d(in_channels, 1, kernel_size, padding=padding)
        self.fc1 = nn.Linear(num_segments, num_segments)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(num_segments, groups)

        self.init_weights()

    def init_weights(self):
        """Initiate the parameters either from existing checkpoint or from
        scratch."""
        # The bias of the last fc layer is initialized to
        # make the post-sigmoid output start from 1:
        # 4 * (sigmoid(0.5108) - 0.5) ~= 0.5, i.e. an initial offset of 0.5.
        self.fc2.bias.data[...] = 0.5108

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        # calculate offset
        # [N, C, T]
        n, _, t = x.shape
        # [N, 1, T]
        x = self.conv(x)
        # [N, T]
        x = x.view(n, t)
        # [N, T]
        x = self.relu(self.fc1(x))
        # [N, groups]
        x = self.fc2(x)
        # [N, 1, groups]
        x = x.view(n, 1, -1)

        # to make sure the output is in (-t/2, t/2)
        # where t = num_segments = 8
        x = 4 * (self.sigmoid(x) - 0.5)
        # [N, 1, groups]
        return x


class TemporalInterlace(nn.Module):
    """Temporal interlace module.

    This module is proposed in `Temporal Interlacing Network
    <https://arxiv.org/abs/2001.06499>`_

    Args:
        in_channels (int): Channel num of input features.
        num_segments (int): Number of frame segments. Default: 3.
        shift_div (int): Number of division parts for shift. Default: 1.
    """

    def __init__(self, in_channels, num_segments=3, shift_div=1):
        super().__init__()
        self.num_segments = num_segments
        self.shift_div = shift_div
        self.in_channels = in_channels
        # hard code ``deform_groups`` according to original repo.
        self.deform_groups = 2

        self.offset_net = OffsetNet(in_channels // shift_div,
                                    self.deform_groups, num_segments)
        self.weight_net = WeightNet(in_channels // shift_div,
                                    self.deform_groups)

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.

        Returns:
            torch.Tensor: The output of the module.
        """
        # x: [N, C, H, W],
        # where N = num_batches x num_segments, C = shift_div * num_folds
        n, c, h, w = x.size()
        num_batches = n // self.num_segments
        num_folds = c // self.shift_div

        # x_out: [num_batches x num_segments, C, H, W]
        # FIX: match the input dtype so fp16/bf16 inputs can be assigned
        # into the buffer without a dtype mismatch.
        x_out = torch.zeros((n, c, h, w), dtype=x.dtype, device=x.device)
        # x_descriptor: [num_batches, num_segments, num_folds, H, W]
        x_descriptor = x[:, :num_folds, :, :].view(num_batches,
                                                   self.num_segments,
                                                   num_folds, h, w)

        # x should only obtain information on temporal and channel dimensions
        # x_pooled: [num_batches, num_segments, num_folds, W]
        x_pooled = torch.mean(x_descriptor, 3)
        # x_pooled: [num_batches, num_segments, num_folds]
        x_pooled = torch.mean(x_pooled, 3)
        # x_pooled: [num_batches, num_folds, num_segments]
        x_pooled = x_pooled.permute(0, 2, 1).contiguous()

        # Calculate weight and bias, here groups = 2
        # x_offset: [num_batches, groups]
        x_offset = self.offset_net(x_pooled).view(num_batches, -1)
        # x_weight: [num_batches, num_segments, groups]
        x_weight = self.weight_net(x_pooled)

        # x_offset: [num_batches, 2 * groups]
        x_offset = torch.cat([x_offset, -x_offset], 1)
        # x_shift: [num_batches, num_segments, num_folds, H, W]
        x_shift = linear_sampler(x_descriptor, x_offset)

        # x_weight: [num_batches, num_segments, groups, 1]
        x_weight = x_weight[:, :, :, None]
        # x_weight:
        # [num_batches, num_segments, groups * 2, c // self.shift_div // 4]
        x_weight = x_weight.repeat(1, 1, 2, num_folds // 2 // 2)
        # x_weight:
        # [num_batches, num_segments, c // self.shift_div = num_folds]
        x_weight = x_weight.view(x_weight.size(0), x_weight.size(1), -1)

        # x_weight: [num_batches, num_segments, num_folds, 1, 1]
        x_weight = x_weight[:, :, :, None, None]
        # x_shift: [num_batches, num_segments, num_folds, H, W]
        x_shift = x_shift * x_weight
        # x_shift: [num_batches, num_segments, num_folds, H, W]
        x_shift = x_shift.contiguous().view(n, num_folds, h, w)

        # x_out: [num_batches x num_segments, C, H, W]
        # Shifted folds go in front, untouched channels are copied through.
        x_out[:, :num_folds, :] = x_shift
        x_out[:, num_folds:, :] = x[:, num_folds:, :]

        return x_out


@MODELS.register_module()
class ResNetTIN(ResNetTSM):
    """ResNet backbone for TIN.

    Args:
        depth (int): Depth of ResNet, from {18, 34, 50, 101, 152}.
        num_segments (int): Number of frame segments. Default: 8.
        is_tin (bool): Whether to apply temporal interlace. Default: True.
        shift_div (int): Number of division parts for shift. Default: 4.
        kwargs (dict, optional): Arguments for ResNet.
    """

    def __init__(self, depth, is_tin=True, **kwargs):
        self.is_tin = is_tin
        super().__init__(depth, **kwargs)

    def init_structure(self):
        """Initialize structure for tin: interlace modules plus optional
        non-local blocks."""
        if self.is_tin:
            self.make_temporal_interlace()
        if len(self.non_local_cfg) != 0:
            self.make_non_local()

    def _get_wrap_prefix(self):
        """Prefixes stripped when mapping wrapped module names back to the
        original checkpoint keys (see ``load_original_weights``)."""
        return ['.net2']

    def make_temporal_interlace(self):
        """Make temporal interlace for some layers."""
        num_segment_list = [self.num_segments] * 4
        assert num_segment_list[-1] > 0

        # NOTE: unlike ResNetTSM, ``n_round`` stays 1 even for deep models;
        # the message below only reports the value in use.
        n_round = 1
        if len(list(self.layer3.children())) >= 23:
            print(f'=> Using n_round {n_round} to insert temporal shift.')

        def make_block_interlace(stage, num_segments, shift_div):
            """Apply Deformable shift for a ResNet layer module.

            Args:
                stage (nn.module): A ResNet layer to be deformed.
                num_segments (int): Number of frame segments.
                shift_div (int): Number of division parts for shift.

            Returns:
                nn.Sequential: A Sequential container consisted of
                deformed Interlace blocks.
            """
            blocks = list(stage.children())
            for i, b in enumerate(blocks):
                if i % n_round == 0:
                    tds = TemporalInterlace(
                        b.conv1.in_channels,
                        num_segments=num_segments,
                        shift_div=shift_div)
                    blocks[i].conv1.conv = CombineNet(tds,
                                                      blocks[i].conv1.conv)
            return nn.Sequential(*blocks)

        self.layer1 = make_block_interlace(self.layer1, num_segment_list[0],
                                           self.shift_div)
        self.layer2 = make_block_interlace(self.layer2, num_segment_list[1],
                                           self.shift_div)
        self.layer3 = make_block_interlace(self.layer3, num_segment_list[2],
                                           self.shift_div)
        self.layer4 = make_block_interlace(self.layer4, num_segment_list[3],
                                           self.shift_div)


# ---- mmaction/models/backbones/resnet_tsm.py ----
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, NonLocal3d
from mmengine.logging import MMLogger
from mmengine.runner.checkpoint import _load_checkpoint
from torch.nn.modules.utils import _ntuple

from mmaction.registry import MODELS
from .resnet import ResNet


class NL3DWrapper(nn.Module):
    """3D Non-local wrapper for ResNet50.

    Wrap ResNet layers with 3D NonLocal modules.

    Args:
        block (nn.Module): Residual blocks to be built.
        num_segments (int): Number of frame segments.
        non_local_cfg (dict): Config for non-local layers. Default: ``dict()``.
    """

    def __init__(self, block, num_segments, non_local_cfg=dict()):
        super(NL3DWrapper, self).__init__()
        self.block = block
        self.non_local_cfg = non_local_cfg
        self.non_local_block = NonLocal3d(self.block.conv3.norm.num_features,
                                          **self.non_local_cfg)
        self.num_segments = num_segments

    def forward(self, x):
        """Defines the computation performed at every call."""
        x = self.block(x)

        # Fold the segment dim out of the batch to run the 3D non-local
        # block, then flatten back to [N*T, C, H, W].
        n, c, h, w = x.size()
        x = x.view(n // self.num_segments, self.num_segments, c, h,
                   w).transpose(1, 2).contiguous()
        x = self.non_local_block(x)
        x = x.transpose(1, 2).contiguous().view(n, c, h, w)
        return x
+ """ + + def __init__(self, block, num_segments, non_local_cfg=dict()): + super(NL3DWrapper, self).__init__() + self.block = block + self.non_local_cfg = non_local_cfg + self.non_local_block = NonLocal3d(self.block.conv3.norm.num_features, + **self.non_local_cfg) + self.num_segments = num_segments + + def forward(self, x): + """Defines the computation performed at every call.""" + x = self.block(x) + + n, c, h, w = x.size() + x = x.view(n // self.num_segments, self.num_segments, c, h, + w).transpose(1, 2).contiguous() + x = self.non_local_block(x) + x = x.transpose(1, 2).contiguous().view(n, c, h, w) + return x + + +class TemporalShift(nn.Module): + """Temporal shift module. + + This module is proposed in + `TSM: Temporal Shift Module for Efficient Video Understanding + `_ + + Args: + net (nn.module): Module to make temporal shift. + num_segments (int): Number of frame segments. Default: 3. + shift_div (int): Number of divisions for shift. Default: 8. + """ + + def __init__(self, net, num_segments=3, shift_div=8): + super().__init__() + self.net = net + self.num_segments = num_segments + self.shift_div = shift_div + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. + """ + x = self.shift(x, self.num_segments, shift_div=self.shift_div) + return self.net(x) + + @staticmethod + def shift(x, num_segments, shift_div=3): + """Perform temporal shift operation on the feature. + + Args: + x (torch.Tensor): The input feature to be shifted. + num_segments (int): Number of frame segments. + shift_div (int): Number of divisions for shift. Default: 3. + + Returns: + torch.Tensor: The shifted feature. 
+ """ + # [N, C, H, W] + n, c, h, w = x.size() + + # [N // num_segments, num_segments, C, H*W] + # can't use 5 dimensional array on PPL2D backend for caffe + x = x.view(-1, num_segments, c, h * w) + + # get shift fold + fold = c // shift_div + + # split c channel into three parts: + # left_split, mid_split, right_split + left_split = x[:, :, :fold, :] + mid_split = x[:, :, fold:2 * fold, :] + right_split = x[:, :, 2 * fold:, :] + + # can't use torch.zeros(*A.shape) or torch.zeros_like(A) + # because array on caffe inference must be got by computing + + # shift left on num_segments channel in `left_split` + zeros = left_split - left_split + blank = zeros[:, :1, :, :] + left_split = left_split[:, 1:, :, :] + left_split = torch.cat((left_split, blank), 1) + + # shift right on num_segments channel in `mid_split` + zeros = mid_split - mid_split + blank = zeros[:, :1, :, :] + mid_split = mid_split[:, :-1, :, :] + mid_split = torch.cat((blank, mid_split), 1) + + # right_split: no shift + + # concatenate + out = torch.cat((left_split, mid_split, right_split), 2) + + # [N, C, H, W] + # restore the original dimension + return out.view(n, c, h, w) + + +@MODELS.register_module() +class ResNetTSM(ResNet): + """ResNet backbone for TSM. + + Args: + num_segments (int): Number of frame segments. Defaults to 8. + is_shift (bool): Whether to make temporal shift in reset layers. + Defaults to True. + non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. + Defaults to (0, 0, 0, 0). + non_local_cfg (dict): Config for non-local module. + Defaults to ``dict()``. + shift_div (int): Number of div for shift. Defaults to 8. + shift_place (str): Places in resnet layers for shift, which is chosen + from ['block', 'blockres']. + If set to 'block', it will apply temporal shift to all child blocks + in each resnet layer. 
+ If set to 'blockres', it will apply temporal shift to each `conv1` + layer of all child blocks in each resnet layer. + Defaults to 'blockres'. + temporal_pool (bool): Whether to add temporal pooling. + Defaults to False. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + **kwargs (keyword arguments, optional): Arguments for ResNet. + """ + + def __init__(self, + depth, + num_segments=8, + is_shift=True, + non_local=(0, 0, 0, 0), + non_local_cfg=dict(), + shift_div=8, + shift_place='blockres', + temporal_pool=False, + pretrained2d=True, + **kwargs): + super().__init__(depth, **kwargs) + self.num_segments = num_segments + self.is_shift = is_shift + self.shift_div = shift_div + self.shift_place = shift_place + self.temporal_pool = temporal_pool + self.non_local = non_local + self.non_local_stages = _ntuple(self.num_stages)(non_local) + self.non_local_cfg = non_local_cfg + self.pretrained2d = pretrained2d + self.init_structure() + + def init_structure(self): + """Initialize structure for tsm.""" + if self.is_shift: + self.make_temporal_shift() + if len(self.non_local_cfg) != 0: + self.make_non_local() + if self.temporal_pool: + self.make_temporal_pool() + + def make_temporal_shift(self): + """Make temporal shift for some layers.""" + if self.temporal_pool: + num_segment_list = [ + self.num_segments, self.num_segments // 2, + self.num_segments // 2, self.num_segments // 2 + ] + else: + num_segment_list = [self.num_segments] * 4 + if num_segment_list[-1] <= 0: + raise ValueError('num_segment_list[-1] must be positive') + + if self.shift_place == 'block': + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. 
+ """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + blocks[i] = TemporalShift( + b, num_segments=num_segments, shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.layer1 = make_block_temporal(self.layer1, num_segment_list[0]) + self.layer2 = make_block_temporal(self.layer2, num_segment_list[1]) + self.layer3 = make_block_temporal(self.layer3, num_segment_list[2]) + self.layer4 = make_block_temporal(self.layer4, num_segment_list[3]) + + elif 'blockres' in self.shift_place: + n_round = 1 + if len(list(self.layer3.children())) >= 23: + n_round = 2 + + def make_block_temporal(stage, num_segments): + """Make temporal shift on some blocks. + + Args: + stage (nn.Module): Model layers to be shifted. + num_segments (int): Number of frame segments. + + Returns: + nn.Module: The shifted blocks. + """ + blocks = list(stage.children()) + for i, b in enumerate(blocks): + if i % n_round == 0: + blocks[i].conv1.conv = TemporalShift( + b.conv1.conv, + num_segments=num_segments, + shift_div=self.shift_div) + return nn.Sequential(*blocks) + + self.layer1 = make_block_temporal(self.layer1, num_segment_list[0]) + self.layer2 = make_block_temporal(self.layer2, num_segment_list[1]) + self.layer3 = make_block_temporal(self.layer3, num_segment_list[2]) + self.layer4 = make_block_temporal(self.layer4, num_segment_list[3]) + + else: + raise NotImplementedError + + def make_temporal_pool(self): + """Make temporal pooling between layer1 and layer2, using a 3D max + pooling layer.""" + + class TemporalPool(nn.Module): + """Temporal pool module. + + Wrap layer2 in ResNet50 with a 3D max pooling layer. + + Args: + net (nn.Module): Module to make temporal pool. + num_segments (int): Number of frame segments. 
+ """ + + def __init__(self, net, num_segments): + super().__init__() + self.net = net + self.num_segments = num_segments + self.max_pool3d = nn.MaxPool3d( + kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0)) + + def forward(self, x): + """Defines the computation performed at every call.""" + # [N, C, H, W] + n, c, h, w = x.size() + # [N // num_segments, C, num_segments, H, W] + x = x.view(n // self.num_segments, self.num_segments, c, h, + w).transpose(1, 2) + # [N // num_segmnets, C, num_segments // 2, H, W] + x = self.max_pool3d(x) + # [N // 2, C, H, W] + x = x.transpose(1, 2).contiguous().view(n // 2, c, h, w) + return self.net(x) + + self.layer2 = TemporalPool(self.layer2, self.num_segments) + + def make_non_local(self): + """Wrap resnet layer into non local wrapper.""" + # This part is for ResNet50 + for i in range(self.num_stages): + non_local_stage = self.non_local_stages[i] + if sum(non_local_stage) == 0: + continue + + layer_name = f'layer{i + 1}' + res_layer = getattr(self, layer_name) + + for idx, non_local in enumerate(non_local_stage): + if non_local: + res_layer[idx] = NL3DWrapper(res_layer[idx], + self.num_segments, + self.non_local_cfg) + + def _get_wrap_prefix(self): + return ['.net', '.block'] + + def load_original_weights(self, logger): + """Load weights from original checkpoint, which required converting + keys.""" + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') + if 'state_dict' in state_dict_torchvision: + state_dict_torchvision = state_dict_torchvision['state_dict'] + + wrapped_layers_map = dict() + for name, module in self.named_modules(): + # convert torchvision keys + ori_name = name + for wrap_prefix in self._get_wrap_prefix(): + if wrap_prefix in ori_name: + ori_name = ori_name.replace(wrap_prefix, '') + wrapped_layers_map[ori_name] = name + + if isinstance(module, ConvModule): + if 'downsample' in ori_name: + # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0 + tv_conv_name = ori_name 
+ '.0' + # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1 + tv_bn_name = ori_name + '.1' + else: + # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n} + tv_conv_name = ori_name + # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n} + tv_bn_name = ori_name.replace('conv', 'bn') + + for conv_param in ['.weight', '.bias']: + if tv_conv_name + conv_param in state_dict_torchvision: + state_dict_torchvision[ori_name+'.conv'+conv_param] = \ + state_dict_torchvision.pop(tv_conv_name+conv_param) + + for bn_param in [ + '.weight', '.bias', '.running_mean', '.running_var' + ]: + if tv_bn_name + bn_param in state_dict_torchvision: + state_dict_torchvision[ori_name+'.bn'+bn_param] = \ + state_dict_torchvision.pop(tv_bn_name+bn_param) + + # convert wrapped keys + for param_name in list(state_dict_torchvision.keys()): + layer_name = '.'.join(param_name.split('.')[:-1]) + if layer_name in wrapped_layers_map: + wrapped_name = param_name.replace( + layer_name, wrapped_layers_map[layer_name]) + print(f'wrapped_name {wrapped_name}') + state_dict_torchvision[ + wrapped_name] = state_dict_torchvision.pop(param_name) + + msg = self.load_state_dict(state_dict_torchvision, strict=False) + logger.info(msg) + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + self.load_original_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() diff --git a/mmaction/models/backbones/rgbposeconv3d.py b/mmaction/models/backbones/rgbposeconv3d.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4bdd9b89c746579784f4d5df4d7870da71a4bc --- /dev/null +++ b/mmaction/models/backbones/rgbposeconv3d.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class RGBPoseConv3D(BaseModule): + """RGBPoseConv3D backbone. + + Args: + pretrained (str): The file path to a pretrained model. + Defaults to None. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 4. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 4. + rgb_detach (bool): Whether to detach the gradients from the pose path. + Defaults to False. + pose_detach (bool): Whether to detach the gradients from the rgb path. + Defaults to False. + rgb_drop_path (float): The drop rate for dropping the features from + the pose path. Defaults to 0. + pose_drop_path (float): The drop rate for dropping the features from + the rgb path. Defaults to 0. + rgb_pathway (dict): Configuration of rgb branch. Defaults to + ``dict(num_stages=4, lateral=True, lateral_infl=1, + lateral_activate=(0, 0, 1, 1), fusion_kernel=7, base_channels=64, + conv1_kernel=(1, 7, 7), inflate=(0, 0, 1, 1), with_pool2=False)``. + pose_pathway (dict): Configuration of pose branch. 
Defaults to + ``dict(num_stages=3, stage_blocks=(4, 6, 3), lateral=True, + lateral_inv=True, lateral_infl=16, lateral_activate=(0, 1, 1), + fusion_kernel=7, in_channels=17, base_channels=32, + out_indices=(2, ), conv1_kernel=(1, 7, 7), conv1_stride_s=1, + conv1_stride_t=1, pool1_stride_s=1, pool1_stride_t=1, + inflate=(0, 1, 1), spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), with_pool2=False)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + pretrained: Optional[str] = None, + speed_ratio: int = 4, + channel_ratio: int = 4, + rgb_detach: bool = False, + pose_detach: bool = False, + rgb_drop_path: float = 0, + pose_drop_path: float = 0, + rgb_pathway: Dict = dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=(0, 0, 1, 1), + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway: Dict = dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if rgb_pathway['lateral']: + rgb_pathway['speed_ratio'] = speed_ratio + rgb_pathway['channel_ratio'] = channel_ratio + + if pose_pathway['lateral']: + pose_pathway['speed_ratio'] = speed_ratio + pose_pathway['channel_ratio'] = channel_ratio + + self.rgb_path = ResNet3dPathway(**rgb_pathway) + self.pose_path = ResNet3dPathway(**pose_pathway) + self.rgb_detach = rgb_detach + 
self.pose_detach = pose_detach + assert 0 <= rgb_drop_path <= 1 + assert 0 <= pose_drop_path <= 1 + self.rgb_drop_path = rgb_drop_path + self.pose_drop_path = pose_drop_path + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.rgb_path.init_weights() + self.pose_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, imgs: torch.Tensor, heatmap_imgs: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + imgs (torch.Tensor): The input data. + heatmap_imgs (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. + """ + if self.training: + rgb_drop_path = torch.rand(1) < self.rgb_drop_path + pose_drop_path = torch.rand(1) < self.pose_drop_path + else: + rgb_drop_path, pose_drop_path = False, False + # We assume base_channel for RGB and Pose are 64 and 32. 
+ x_rgb = self.rgb_path.conv1(imgs) + x_rgb = self.rgb_path.maxpool(x_rgb) + # N x 64 x 8 x 56 x 56 + x_pose = self.pose_path.conv1(heatmap_imgs) + x_pose = self.pose_path.maxpool(x_pose) + + x_rgb = self.rgb_path.layer1(x_rgb) + x_rgb = self.rgb_path.layer2(x_rgb) + x_pose = self.pose_path.layer1(x_pose) + + if hasattr(self.rgb_path, 'layer2_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer2_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer1_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer1_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer2_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer1_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer3(x_rgb) + x_pose = self.pose_path.layer2(x_pose) + + if hasattr(self.rgb_path, 'layer3_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer3_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer2_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer2_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer3_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer2_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer4(x_rgb) + x_pose = self.pose_path.layer3(x_pose) + + return x_rgb, x_pose diff --git a/mmaction/models/backbones/stgcn.py b/mmaction/models/backbones/stgcn.py new file mode 100644 index 
0000000000000000000000000000000000000000..9011a6262407305d4454c0b3517be11fdddce9f3
--- /dev/null
+++ b/mmaction/models/backbones/stgcn.py
@@ -0,0 +1,238 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy as cp
from typing import Dict, List, Optional, Union

import torch
import torch.nn as nn
from mmengine.model import BaseModule, ModuleList

from mmaction.registry import MODELS
from ..utils import Graph, mstcn, unit_gcn, unit_tcn

# Small bias added before the int() truncation when inflating channel
# counts, so e.g. 64 * 2**1 does not truncate below the intended value.
EPS = 1e-4


class STGCNBlock(BaseModule):
    """The basic block of STGCN.

    A spatial graph convolution (``gcn``) followed by a temporal
    convolution (``tcn``), with a residual connection.

    Extra keyword arguments prefixed with ``gcn_`` or ``tcn_`` are routed
    (with the prefix stripped) to the GCN and TCN sub-modules respectively;
    any other extra keyword argument is rejected.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        A (torch.Tensor): The adjacency matrix defined in the graph
            with shape of `(num_subsets, num_nodes, num_nodes)`.
        stride (int): Stride of the temporal convolution. Defaults to 1.
        residual (bool): Whether to use residual connection. Defaults to True.
        init_cfg (dict or list[dict], optional): Config to control
            the initialization. Defaults to None.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 A: torch.Tensor,
                 stride: int = 1,
                 residual: bool = True,
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)

        # Split kwargs by prefix: 'gcn_*' -> GCN, 'tcn_*' -> TCN.
        gcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'gcn_'}
        tcn_kwargs = {k[4:]: v for k, v in kwargs.items() if k[:4] == 'tcn_'}
        kwargs = {
            k: v
            for k, v in kwargs.items() if k[:4] not in ['gcn_', 'tcn_']
        }
        assert len(kwargs) == 0, f'Invalid arguments: {kwargs}'

        tcn_type = tcn_kwargs.pop('type', 'unit_tcn')
        assert tcn_type in ['unit_tcn', 'mstcn']
        gcn_type = gcn_kwargs.pop('type', 'unit_gcn')
        assert gcn_type in ['unit_gcn']

        self.gcn = unit_gcn(in_channels, out_channels, A, **gcn_kwargs)

        if tcn_type == 'unit_tcn':
            # Plain temporal conv with a fixed kernel size of 9.
            self.tcn = unit_tcn(
                out_channels, out_channels, 9, stride=stride, **tcn_kwargs)
        elif tcn_type == 'mstcn':
            # Multi-scale temporal conv (STGCN++ variant).
            self.tcn = mstcn(
                out_channels, out_channels, stride=stride, **tcn_kwargs)
        self.relu = nn.ReLU()

        # Residual path: identity when shapes match, 1x1 temporal conv
        # otherwise, or disabled entirely.
        if not residual:
            self.residual = lambda x: 0
        elif (in_channels == out_channels) and (stride == 1):
            self.residual = lambda x: x
        else:
            self.residual = unit_tcn(
                in_channels, out_channels, kernel_size=1, stride=stride)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        res = self.residual(x)
        x = self.tcn(self.gcn(x)) + res
        return self.relu(x)


@MODELS.register_module()
class STGCN(BaseModule):
    """STGCN backbone.

    Spatial Temporal Graph Convolutional
    Networks for Skeleton-Based Action Recognition.
    More details can be found in the `paper
    <https://arxiv.org/abs/1801.07455>`__ .

    Args:
        graph_cfg (dict): Config for building the graph.
        in_channels (int): Number of input channels. Defaults to 3.
        base_channels (int): Number of base channels. Defaults to 64.
        data_bn_type (str): Type of the data bn layer. Defaults to ``'VC'``.
        ch_ratio (int): Inflation ratio of the number of channels.
            Defaults to 2.
        num_person (int): Maximum number of people. Only used when
            data_bn_type == 'MVC'. Defaults to 2.
        num_stages (int): Total number of stages. Defaults to 10.
        inflate_stages (list[int]): Stages to inflate the number of channels.
            Defaults to ``[5, 8]``.
        down_stages (list[int]): Stages to perform downsampling in
            the time dimension. Defaults to ``[5, 8]``.
        stage_cfgs (dict): Extra config dict for each stage.
            Defaults to ``dict()``.
        init_cfg (dict or list[dict], optional): Config to control
            the initialization. Defaults to None.

    Examples:
        >>> import torch
        >>> from mmaction.models import STGCN
        >>>
        >>> mode = 'stgcn_spatial'
        >>> batch_size, num_person, num_frames = 2, 2, 150
        >>>
        >>> # openpose-18 layout
        >>> num_joints = 18
        >>> model = STGCN(graph_cfg=dict(layout='openpose', mode=mode))
        >>> model.init_weights()
        >>> inputs = torch.randn(batch_size, num_person,
        ...                      num_frames, num_joints, 3)
        >>> output = model(inputs)
        >>> print(output.shape)
        >>>
        >>> # nturgb+d layout
        >>> num_joints = 25
        >>> model = STGCN(graph_cfg=dict(layout='nturgb+d', mode=mode))
        >>> model.init_weights()
        >>> inputs = torch.randn(batch_size, num_person,
        ...                      num_frames, num_joints, 3)
        >>> output = model(inputs)
        >>> print(output.shape)
        >>>
        >>> # coco layout
        >>> num_joints = 17
        >>> model = STGCN(graph_cfg=dict(layout='coco', mode=mode))
        >>> model.init_weights()
        >>> inputs = torch.randn(batch_size, num_person,
        ...                      num_frames, num_joints, 3)
        >>> output = model(inputs)
        >>> print(output.shape)
        >>>
        >>> # custom settings
        >>> # instantiate STGCN++
        >>> model = STGCN(graph_cfg=dict(layout='coco', mode='spatial'),
        ...               gcn_adaptive='init', gcn_with_res=True,
        ...               tcn_type='mstcn')
        >>> model.init_weights()
        >>> output = model(inputs)
        >>> print(output.shape)
        torch.Size([2, 2, 256, 38, 18])
        torch.Size([2, 2, 256, 38, 25])
        torch.Size([2, 2, 256, 38, 17])
        torch.Size([2, 2, 256, 38, 17])
    """

    def __init__(self,
                 graph_cfg: Dict,
                 in_channels: int = 3,
                 base_channels: int = 64,
                 data_bn_type: str = 'VC',
                 ch_ratio: int = 2,
                 num_person: int = 2,
                 num_stages: int = 10,
                 inflate_stages: List[int] = [5, 8],
                 down_stages: List[int] = [5, 8],
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None,
                 **kwargs) -> None:
        super().__init__(init_cfg=init_cfg)

        self.graph = Graph(**graph_cfg)
        A = torch.tensor(
            self.graph.A, dtype=torch.float32, requires_grad=False)
        self.data_bn_type = data_bn_type

        # 'MVC' normalizes over person*joint*channel; 'VC' over
        # joint*channel; anything else disables the data BN.
        if data_bn_type == 'MVC':
            self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1))
        elif data_bn_type == 'VC':
            self.data_bn = nn.BatchNorm1d(in_channels * A.size(1))
        else:
            self.data_bn = nn.Identity()

        # Expand each sequence-valued kwarg into one value per stage.
        lw_kwargs = [cp.deepcopy(kwargs) for i in range(num_stages)]
        for k, v in kwargs.items():
            if isinstance(v, (tuple, list)) and len(v) == num_stages:
                for i in range(num_stages):
                    lw_kwargs[i][k] = v[i]
        # No dropout on the very first stage.
        lw_kwargs[0].pop('tcn_dropout', None)

        self.in_channels = in_channels
        self.base_channels = base_channels
        self.ch_ratio = ch_ratio
        self.inflate_stages = inflate_stages
        self.down_stages = down_stages

        modules = []
        # Stage 1 only exists when a channel projection is needed.
        if self.in_channels != self.base_channels:
            modules = [
                STGCNBlock(
                    in_channels,
                    base_channels,
                    A.clone(),
                    1,
                    residual=False,
                    **lw_kwargs[0])
            ]

        inflate_times = 0
        for i in range(2, num_stages + 1):
            # Downsample time at the configured stages; inflate channels by
            # ch_ratio at the configured stages (EPS guards truncation).
            stride = 1 + (i in down_stages)
            in_channels = base_channels
            if i in inflate_stages:
                inflate_times += 1
            out_channels = int(self.base_channels *
                               self.ch_ratio**inflate_times + EPS)
            base_channels = out_channels
            modules.append(
                STGCNBlock(in_channels, out_channels, A.clone(), stride,
                           **lw_kwargs[i - 1]))

        # Without the projection stage, one fewer block was built.
        if self.in_channels == self.base_channels:
            num_stages -= 1

        self.num_stages = num_stages
        self.gcn = ModuleList(modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        # x: (N, M, T, V, C) -> flatten persons into the batch for the GCN.
        N, M, T, V, C = x.size()
        x = x.permute(0, 1, 3, 4, 2).contiguous()
        if self.data_bn_type == 'MVC':
            x = self.data_bn(x.view(N, M * V * C, T))
        else:
            x = self.data_bn(x.view(N * M, V * C, T))
        x = x.view(N, M, V, C, T).permute(0, 1, 3, 4,
                                          2).contiguous().view(N * M, C, T, V)

        for i in range(self.num_stages):
            x = self.gcn[i](x)

        # Restore the person dimension: (N, M, C', T', V).
        x = x.reshape((N, M) + x.shape[1:])
        return x
diff --git a/mmaction/models/backbones/swin.py b/mmaction/models/backbones/swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf57cbc41b5bc25c56dbde6b0ddde1b3d2702ed
--- /dev/null
+++ b/mmaction/models/backbones/swin.py
@@ -0,0 +1,1022 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from functools import lru_cache, reduce +from operator import mul +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint + +from mmaction.registry import MODELS + + +def window_partition(x: torch.Tensor, + window_size: Sequence[int]) -> torch.Tensor: + """ + Args: + x (torch.Tensor): The input features of shape :math:`(B, D, H, W, C)`. + window_size (Sequence[int]): The window size, :math:`(w_d, w_h, w_w)`. + + Returns: + torch.Tensor: The partitioned windows of shape + :math:`(B*num_windows, w_d*w_h*w_w, C)`. + """ + B, D, H, W, C = x.shape + x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C) + windows = x.permute(0, 1, 3, 5, 2, 4, 6, + 7).contiguous().view(-1, reduce(mul, window_size), C) + return windows + + +def window_reverse(windows: torch.Tensor, window_size: Sequence[int], B: int, + D: int, H: int, W: int) -> torch.Tensor: + """ + Args: + windows (torch.Tensor): Input windows of shape + :meth:`(B*num_windows, w_d, w_h, w_w, C)`. + window_size (Sequence[int]): The window size, :meth:`(w_d, w_h, w_w)`. + B (int): Batch size of feature maps. + D (int): Temporal length of feature maps. + H (int): Height of feature maps. + W (int): Width of feature maps. + + Returns: + torch.Tensor: The feature maps reversed from windows of + shape :math:`(B, D, H, W, C)`. 
+ """ + x = windows.view(B, D // window_size[0], H // window_size[1], + W // window_size[2], window_size[0], window_size[1], + window_size[2], -1) + x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) + return x + + +def get_window_size( + x_size: Sequence[int], + window_size: Sequence[int], + shift_size: Optional[Sequence[int]] = None +) -> Union[Tuple[int], Tuple[Tuple[int]]]: + """Calculate window size and shift size according to the input size. + + Args: + x_size (Sequence[int]): The input size. + window_size (Sequence[int]): The expected window size. + shift_size (Sequence[int], optional): The expected shift size. + Defaults to None. + + Returns: + tuple: The calculated window size and shift size. + """ + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +# cache each stage results +@lru_cache() +def compute_mask(D: int, H: int, W: int, window_size: Sequence[int], + shift_size: Sequence[int], + device: Union[str, torch.device]) -> torch.Tensor: + """Compute attention mask. + + Args: + D (int): Temporal length of feature maps. + H (int): Height of feature maps. + W (int): Width of feature maps. + window_size (Sequence[int]): The window size. + shift_size (Sequence[int]): The shift size. + device (str or :obj:`torch.device`): The device of the mask. + + Returns: + torch.Tensor: The attention mask used for shifted window attention. 
+ """ + img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + return attn_mask + + +class WindowAttention3D(BaseModule): + """Window based multi-head self attention (W-MSA) module with relative + position bias. It supports both of shifted and non-shifted window. + + Args: + embed_dims (int): Number of input channels. + window_size (Sequence[int]): The temporal length, height and + width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, + key, value. Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop (float): Dropout ratio of attention weight. Defaults to 0.0. + proj_drop (float): Dropout ratio of output. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + window_size: Sequence[int], + num_heads: int, + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + attn_drop: float = 0., + proj_drop: float = 0., + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = embed_dims // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + # # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * + (2 * window_size[2] - 1), num_heads)) + + # get pair-wise relative position index for + # each token inside the window + coords_d = torch.arange(self.window_size[0]) + coords_h = torch.arange(self.window_size[1]) + coords_w = torch.arange(self.window_size[2]) + coords = torch.stack(torch.meshgrid( + coords_d, + coords_h, + coords_w, + )) # 3, Wd, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww + # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = \ + coords_flatten[:, :, None] - coords_flatten[:, None, :] + # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + # shift to start from 0 + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * \ + (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop) + + 
trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """Forward function. + + Args: + x (torch.Tensor): Input feature maps of shape + :meth:`(B*num_windows, N, C)`. + mask (torch.Tensor, optional): (0/-inf) mask of shape + :meth:`(num_windows, N, N)`. Defaults to None. + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape(-1)].reshape( + N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Mlp(BaseModule): + """Multilayer perceptron. + + Args: + in_features (int): Number of input features. + hidden_features (int, optional): Number of hidden features. + Defaults to None. + out_features (int, optional): Number of output features. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + drop: float = 0., + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class SwinTransformerBlock3D(BaseModule): + """Swin Transformer Block. + + Args: + embed_dims (int): Number of feature channels. + num_heads (int): Number of attention heads. + window_size (Sequence[int]): Window size. Defaults to ``(8, 7, 7)``. + shift_size (Sequence[int]): Shift size for SW-MSA or W-MSA. + Defaults to ``(0, 0, 0)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_path (float): Stochastic depth rate. Defaults to 0.1. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Defaults to False. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + num_heads: int, + window_size: Sequence[int] = (8, 7, 7), + shift_size: Sequence[int] = (0, 0, 0), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop: float = 0., + attn_drop: float = 0., + drop_path: float = 0.1, + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + with_cp: bool = False, + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.with_cp = with_cp + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], 'shift_size[0] must in [0, window_size[0])' + assert 0 <= self.shift_size[1] < self.window_size[ + 1], 'shift_size[1] must in [0, window_size[0])' + assert 0 <= self.shift_size[2] < self.window_size[ + 2], 'shift_size[2] must in [0, window_size[0])' + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + _attn_cfg = { + 'embed_dims': embed_dims, + 'window_size': window_size, + 'num_heads': num_heads, + 'qkv_bias': qkv_bias, + 'qk_scale': qk_scale, + 'attn_drop': attn_drop, + 'proj_drop': drop + } + self.attn = WindowAttention3D(**_attn_cfg) + + self.drop_path = DropPath(drop_path) \ + if drop_path > 0. 
else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + _mlp_cfg = { + 'in_features': embed_dims, + 'hidden_features': int(embed_dims * mlp_ratio), + 'act_cfg': act_cfg, + 'drop': drop + } + self.mlp = Mlp(**_mlp_cfg) + + def forward_part1(self, x: torch.Tensor, + mask_matrix: torch.Tensor) -> torch.Tensor: + """Forward function part1.""" + B, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = torch.roll( + x, + shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), + dims=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.view(-1, *(window_size + (C, ))) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = torch.roll( + shifted_x, + shifts=(shift_size[0], shift_size[1], shift_size[2]), + dims=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :].contiguous() + return x + + def forward_part2(self, x: torch.Tensor) -> torch.Tensor: + """Forward function part2.""" + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x: torch.Tensor, + mask_matrix: torch.Tensor) -> torch.Tensor: + """ + Args: + 
x (torch.Tensor): Input features of shape :math:`(B, D, H, W, C)`. + mask_matrix (torch.Tensor): Attention mask for cyclic shift. + """ + + shortcut = x + if self.with_cp: + x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) + else: + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + + if self.with_cp: + x = x + checkpoint.checkpoint(self.forward_part2, x) + else: + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(BaseModule): + """Patch Merging Layer. + + Args: + embed_dims (int): Number of input channels. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims: int, + norm_cfg: Dict = dict(type='LN'), + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.mid_embed_dims = 4 * embed_dims + self.out_embed_dims = 2 * embed_dims + self.reduction = nn.Linear( + self.mid_embed_dims, self.out_embed_dims, bias=False) + self.norm = build_norm_layer(norm_cfg, self.mid_embed_dims)[1] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Perform patch merging. + + Args: + x (torch.Tensor): Input feature maps of shape + :math:`(B, D, H, W, C)`. + + Returns: + torch.Tensor: The merged feature maps of shape + :math:`(B, D, H/2, W/2, 2*C)`. + """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(BaseModule): + """A basic Swin Transformer layer for one stage. 
+ + Args: + embed_dims (int): Number of feature channels. + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (Sequence[int]): Local window size. + Defaults to ``(8, 7, 7)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float or Sequence[float]): Stochastic depth rates. + Defaults to 0.0. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict, optional): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + downsample (:class:`PatchMerging`, optional): Downsample layer + at the end of the layer. Defaults to None. + with_cp (bool): Use checkpoint or not. Using checkpoint will + save some memory while slowing down the training speed. + Defaults to False. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + depth: int, + num_heads: int, + window_size: Sequence[int] = (8, 7, 7), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop: float = 0., + attn_drop: float = 0., + drop_paths: Union[float, Sequence[float]] = 0., + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + downsample: Optional[PatchMerging] = None, + with_cp: bool = False, + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.with_cp = with_cp + + if not isinstance(drop_paths, Sequence): + drop_paths = [drop_paths] * depth + + # build blocks + self.blocks = ModuleList() + for i in range(depth): + _block_cfg = { + 'embed_dims': embed_dims, + 'num_heads': num_heads, + 'window_size': window_size, + 'shift_size': (0, 0, 0) if (i % 2 == 0) else self.shift_size, + 'mlp_ratio': mlp_ratio, + 'qkv_bias': qkv_bias, + 'qk_scale': qk_scale, + 'drop': drop, + 'attn_drop': attn_drop, + 'drop_path': drop_paths[i], + 'act_cfg': act_cfg, + 'norm_cfg': norm_cfg, + 'with_cp': with_cp + } + + block = SwinTransformerBlock3D(**_block_cfg) + self.blocks.append(block) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample( + embed_dims=embed_dims, norm_cfg=norm_cfg) + + def forward(self, + x: torch.Tensor, + do_downsample: bool = True) -> torch.Tensor: + """Forward function. + + Args: + x (torch.Tensor): Input feature maps of shape + :math:`(B, C, D, H, W)`. + do_downsample (bool): Whether to downsample the output of + the current layer. Defaults to True. 
+ """ + # calculate attention mask for SW-MSA + B, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + x = rearrange(x, 'b c d h w -> b d h w c') + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) + for blk in self.blocks: + x = blk(x, attn_mask) + + if self.downsample is not None and do_downsample: + x = self.downsample(x) + return x + + @property + def out_embed_dims(self): + if self.downsample is not None: + return self.downsample.out_embed_dims + else: + return self.embed_dims + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + Args: + patch_size (Sequence[int] or int]): Patch token size. + Defaults to ``(2, 4, 4)``. + in_channels (int): Number of input video channels. Defaults to 3. + embed_dims (int): Dimensions of embedding. Defaults to 96. + conv_cfg: (dict): Config dict for convolution layer. + Defaults to ``dict(type='Conv3d')``. + norm_cfg (dict, optional): Config dict for norm layer. + Defaults to None. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__(self, + patch_size: Union[Sequence[int], int] = (2, 4, 4), + in_channels: int = 3, + embed_dims: int = 96, + norm_cfg: Optional[Dict] = None, + conv_cfg: Dict = dict(type='Conv3d'), + init_cfg: Optional[Dict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.patch_size = patch_size + self.in_channels = in_channels + self.embed_dims = embed_dims + + self.proj = build_conv_layer( + conv_cfg, + in_channels, + embed_dims, + kernel_size=patch_size, + stride=patch_size) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Perform video to patch embedding. 
+ + Args: + x (torch.Tensor): The input videos of shape + :math:`(B, C, D, H, W)`. In most cases, C is 3. + + Returns: + torch.Tensor: The video patches of shape + :math:`(B, embed_dims, Dp, Hp, Wp)`. + """ + + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, + self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C Dp Wp Wp + if self.norm is not None: + Dp, Hp, Wp = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) # B Dp*Hp*Wp C + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dims, Dp, Hp, Wp) + + return x + + +@MODELS.register_module() +class SwinTransformer3D(BaseModule): + """Video Swin Transformer backbone. + + A pytorch implement of: `Video Swin Transformer + `_ + + Args: + arch (str or dict): Video Swin Transformer architecture. If use string, + choose from 'tiny', 'small', 'base' and 'large'. If use dict, it + should have below keys: + - **embed_dims** (int): The dimensions of embedding. + - **depths** (Sequence[int]): The number of blocks in each stage. + - **num_heads** (Sequence[int]): The number of heads in attention + modules of each stage. + pretrained (str, optional): Name of pretrained model. + Defaults to None. + pretrained2d (bool): Whether to load pretrained 2D model. + Defaults to True. + patch_size (int or Sequence(int)): Patch size. + Defaults to ``(2, 4, 4)``. + in_channels (int): Number of input image channels. Defaults to 3. + window_size (Sequence[int]): Window size. Defaults to ``(8, 7, 7)``. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. 
+ qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout rate. Defaults to 0.0. + attn_drop_rate (float): Attention dropout rate. Defaults to 0.0. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='GELU')``. + norm_cfg (dict): Config dict for norm layer. + Defaults to ``dict(type='LN')``. + patch_norm (bool): If True, add normalization after patch embedding. + Defaults to True. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Defaults to -1. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Defaults to False. + out_indices (Sequence[int]): Indices of output feature. + Defaults to ``(3, )``. + out_after_downsample (bool): Whether to output the feature map of a + stage after the following downsample layer. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. 
+ """ + arch_zoo = { + **dict.fromkeys(['t', 'tiny'], + {'embed_dims': 96, + 'depths': [2, 2, 6, 2], + 'num_heads': [3, 6, 12, 24]}), + **dict.fromkeys(['s', 'small'], + {'embed_dims': 96, + 'depths': [2, 2, 18, 2], + 'num_heads': [3, 6, 12, 24]}), + **dict.fromkeys(['b', 'base'], + {'embed_dims': 128, + 'depths': [2, 2, 18, 2], + 'num_heads': [4, 8, 16, 32]}), + **dict.fromkeys(['l', 'large'], + {'embed_dims': 192, + 'depths': [2, 2, 18, 2], + 'num_heads': [6, 12, 24, 48]}), + } # yapf: disable + + def __init__( + self, + arch: Union[str, Dict], + pretrained: Optional[str] = None, + pretrained2d: bool = True, + patch_size: Union[int, Sequence[int]] = (2, 4, 4), + in_channels: int = 3, + window_size: Sequence[int] = (8, 7, 7), + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0.1, + act_cfg: Dict = dict(type='GELU'), + norm_cfg: Dict = dict(type='LN'), + patch_norm: bool = True, + frozen_stages: int = -1, + with_cp: bool = False, + out_indices: Sequence[int] = (3, ), + out_after_downsample: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) 
+ ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = {'embed_dims', 'depths', 'num_heads'} + assert isinstance(arch, dict) and set(arch) == essential_keys, \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.depths = self.arch_settings['depths'] + self.num_heads = self.arch_settings['num_heads'] + assert len(self.depths) == len(self.num_heads) + self.num_layers = len(self.depths) + assert 1 <= self.num_layers <= 4 + self.out_indices = out_indices + assert max(out_indices) < self.num_layers + self.out_after_downsample = out_after_downsample + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + _patch_cfg = { + 'patch_size': patch_size, + 'in_channels': in_channels, + 'embed_dims': self.embed_dims, + 'norm_cfg': norm_cfg if patch_norm else None, + 'conv_cfg': dict(type='Conv3d') + } + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + total_depth = sum(self.depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] # stochastic depth decay rule + + # build layers + self.layers = ModuleList() + embed_dims = [self.embed_dims] + for i, (depth, num_heads) in \ + enumerate(zip(self.depths, self.num_heads)): + downsample = PatchMerging if i < self.num_layers - 1 else None + _layer_cfg = { + 'embed_dims': embed_dims[-1], + 'depth': depth, + 'num_heads': num_heads, + 'window_size': window_size, + 'mlp_ratio': mlp_ratio, + 'qkv_bias': qkv_bias, + 'qk_scale': qk_scale, + 'drop': drop_rate, + 'attn_drop': attn_drop_rate, + 'drop_paths': dpr[:depth], + 
'act_cfg': act_cfg, + 'norm_cfg': norm_cfg, + 'downsample': downsample, + 'with_cp': with_cp + } + + layer = BasicLayer(**_layer_cfg) + self.layers.append(layer) + + dpr = dpr[depth:] + embed_dims.append(layer.out_embed_dims) + + if self.out_after_downsample: + self.num_features = embed_dims[1:] + else: + self.num_features = embed_dims[:-1] + + for i in out_indices: + if norm_cfg is not None: + norm_layer = build_norm_layer(norm_cfg, + self.num_features[i])[1] + else: + norm_layer = nn.Identity() + + self.add_module(f'norm{i}', norm_layer) + + self._freeze_stages() + + def _freeze_stages(self) -> None: + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def inflate_weights(self, logger: MMLogger) -> None: + """Inflate the swin2d parameters to swin3d. + + The differences between swin3d and swin2d mainly lie in an extra + axis. To utilize the pretrained parameters in 2d model, the weight + of swin2d models should be inflated to fit in the shapes of the + 3d counterpart. + + Args: + logger (MMLogger): The logger used to print debugging information. 
+ """ + checkpoint = _load_checkpoint(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [ + k for k in state_dict.keys() if 'relative_position_index' in k + ] + for k in relative_position_index_keys: + del state_dict[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k] + for k in attn_mask_keys: + del state_dict[k] + state_dict['patch_embed.proj.weight'] = \ + state_dict['patch_embed.proj.weight'].unsqueeze(2).\ + repeat(1, 1, self.patch_size[0], 1, 1) / self.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for k in relative_position_bias_table_keys: + relative_position_bias_table_pretrained = state_dict[k] + relative_position_bias_table_current = self.state_dict()[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) + wd = self.window_size[0] + if nH1 != nH2: + logger.warning(f'Error in loading {k}, passing') + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = \ + torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute( + 1, 0).view(1, nH1, S1, S1), + size=(2 * self.window_size[1] - 1, + 2 * self.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = \ + relative_position_bias_table_pretrained_resized. \ + view(nH2, L2).permute(1, 0) + state_dict[k] = relative_position_bias_table_pretrained.repeat( + 2 * wd - 1, 1) + + # In the original swin2d checkpoint, the last layer of the + # backbone is the norm layer, and the original attribute + # name is `norm`. 
We changed it to `norm3` which means it + # is the last norm layer of stage 4. + if hasattr(self, 'norm3'): + state_dict['norm3.weight'] = state_dict['norm.weight'] + state_dict['norm3.bias'] = state_dict['norm.bias'] + del state_dict['norm.weight'] + del state_dict['norm.bias'] + + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + + def init_weights(self) -> None: + """Initialize the weights in backbone.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + # Inflate 2D model into 3D model. + self.inflate_weights(logger) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> \ + Union[Tuple[torch.Tensor], torch.Tensor]: + """Forward function for Swin3d Transformer.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x.contiguous(), do_downsample=self.out_after_downsample) + + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(x) + out = rearrange(out, 'b d h w c -> b c d h w').contiguous() + outs.append(out) + + if layer.downsample is not None and not self.out_after_downsample: + x = layer.downsample(x) + + if i < self.num_layers - 1: + x = rearrange(x, 'b d h w c -> b c d h w') + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode: bool = True) -> None: + """Convert the model into training mode while keep layers frozen.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/mmaction/models/backbones/tanet.py b/mmaction/models/backbones/tanet.py new file mode 100644 index 0000000000000000000000000000000000000000..95575f4fbcbb50f77f1868a42ef0c336cc70b722 --- /dev/null +++ b/mmaction/models/backbones/tanet.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from copy import deepcopy +from typing import Optional + +import torch +import torch.nn as nn +from torch.utils import checkpoint as cp + +from mmaction.registry import MODELS +from ..common import TAM +from .resnet import Bottleneck, ResNet + + +class TABlock(nn.Module): + """Temporal Adaptive Block (TA-Block) for TANet. + + This block is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + The temporal adaptive module (TAM) is embedded into ResNet-Block + after the first Conv2D, which turns the vanilla ResNet-Block + into TA-Block. + + Args: + block (nn.Module): Residual blocks to be substituted. + num_segments (int): Number of frame segments. + tam_cfg (dict): Config for temporal adaptive module (TAM). + """ + + def __init__(self, block: nn.Module, num_segments: int, + tam_cfg: dict) -> None: + super().__init__() + self.tam_cfg = deepcopy(tam_cfg) + self.block = block + self.num_segments = num_segments + self.tam = TAM( + in_channels=block.conv1.out_channels, + num_segments=num_segments, + **self.tam_cfg) + + if not isinstance(self.block, Bottleneck): + raise NotImplementedError('TA-Blocks have not been fully ' + 'implemented except the pattern based ' + 'on Bottleneck block.') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + assert isinstance(self.block, Bottleneck) + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.block.conv1(x) + out = self.tam(out) + out = self.block.conv2(out) + out = self.block.conv3(out) + + if self.block.downsample is not None: + identity = self.block.downsample(x) + + out = out + identity + + return out + + if self.block.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.block.relu(out) + + return out + + +@MODELS.register_module() +class TANet(ResNet): + """Temporal Adaptive Network (TANet) backbone. 
+ + This backbone is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + Embedding the temporal adaptive module (TAM) into ResNet to + instantiate TANet. + + Args: + depth (int): Depth of resnet, from ``{18, 34, 50, 101, 152}``. + num_segments (int): Number of frame segments. + tam_cfg (dict, optional): Config for temporal adaptive module (TAM). + Defaults to None. + """ + + def __init__(self, + depth: int, + num_segments: int, + tam_cfg: Optional[dict] = None, + **kwargs) -> None: + super().__init__(depth, **kwargs) + assert num_segments >= 3 + self.num_segments = num_segments + tam_cfg = dict() if tam_cfg is None else tam_cfg + self.tam_cfg = deepcopy(tam_cfg) + super().init_weights() + self.make_tam_modeling() + + def init_weights(self): + """Initialize weights.""" + pass + + def make_tam_modeling(self): + """Replace ResNet-Block with TA-Block.""" + + def make_tam_block(stage, num_segments, tam_cfg=dict()): + blocks = list(stage.children()) + for i, block in enumerate(blocks): + blocks[i] = TABlock(block, num_segments, deepcopy(tam_cfg)) + return nn.Sequential(*blocks) + + for i in range(self.num_stages): + layer_name = f'layer{i + 1}' + res_layer = getattr(self, layer_name) + setattr(self, layer_name, + make_tam_block(res_layer, self.num_segments, self.tam_cfg)) diff --git a/mmaction/models/backbones/timesformer.py b/mmaction/models/backbones/timesformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a051282fa65049655146064413c69d3d5b98c7b1 --- /dev/null +++ b/mmaction/models/backbones/timesformer.py @@ -0,0 +1,294 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmengine import ConfigDict +from mmengine.logging import MMLogger +from mmengine.model.weight_init import kaiming_init, trunc_normal_ +from mmengine.runner.checkpoint import _load_checkpoint, load_state_dict +from torch.nn.modules.utils import _pair + +from mmaction.registry import MODELS + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Size of input image. + patch_size (int): Size of one patch. + in_channels (int): Channel num of input features. Defaults to 3. + embed_dims (int): Dimensions of embedding. Defaults to 768. + conv_cfg (dict | None): Config dict for convolution layer. Defaults to + `dict(type='Conv2d')`. + """ + + def __init__(self, + img_size, + patch_size, + in_channels=3, + embed_dims=768, + conv_cfg=dict(type='Conv2d')): + super().__init__() + self.img_size = _pair(img_size) + self.patch_size = _pair(patch_size) + + num_patches = (self.img_size[1] // self.patch_size[1]) * ( + self.img_size[0] // self.patch_size[0]) + assert num_patches * self.patch_size[0] * self.patch_size[1] == \ + self.img_size[0] * self.img_size[1], \ + 'The image size H*W must be divisible by patch size' + self.num_patches = num_patches + + # Use conv layer to embed + self.projection = build_conv_layer( + conv_cfg, + in_channels, + embed_dims, + kernel_size=patch_size, + stride=patch_size) + + self.init_weights() + + def init_weights(self): + """Initialize weights.""" + # Lecun norm from ClassyVision + kaiming_init(self.projection, mode='fan_in', nonlinearity='linear') + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The output of the module. 
+ """ + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = self.projection(x).flatten(2).transpose(1, 2) + return x + + +@MODELS.register_module() +class TimeSformer(nn.Module): + """TimeSformer. A PyTorch impl of `Is Space-Time Attention All You Need for + Video Understanding? `_ + + Args: + num_frames (int): Number of frames in the video. + img_size (int | tuple): Size of input image. + patch_size (int): Size of one patch. + pretrained (str | None): Name of pretrained model. Default: None. + embed_dims (int): Dimensions of embedding. Defaults to 768. + num_heads (int): Number of parallel attention heads in + TransformerCoder. Defaults to 12. + num_transformer_layers (int): Number of transformer layers. Defaults to + 12. + in_channels (int): Channel num of input features. Defaults to 3. + dropout_ratio (float): Probability of dropout layer. Defaults to 0.. + transformer_layers (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict` | None): Config of transformerlayer in + TransformerCoder. If it is obj:`mmcv.ConfigDict`, it would be + repeated `num_transformer_layers` times to a + list[obj:`mmcv.ConfigDict`]. Defaults to None. + attention_type (str): Type of attentions in TransformerCoder. Choices + are 'divided_space_time', 'space_only' and 'joint_space_time'. + Defaults to 'divided_space_time'. + norm_cfg (dict): Config for norm layers. Defaults to + `dict(type='LN', eps=1e-6)`. 
+ """ + supported_attention_types = [ + 'divided_space_time', 'space_only', 'joint_space_time' + ] + + def __init__(self, + num_frames, + img_size, + patch_size, + pretrained=None, + embed_dims=768, + num_heads=12, + num_transformer_layers=12, + in_channels=3, + dropout_ratio=0., + transformer_layers=None, + attention_type='divided_space_time', + norm_cfg=dict(type='LN', eps=1e-6), + **kwargs): + super().__init__(**kwargs) + assert attention_type in self.supported_attention_types, ( + f'Unsupported Attention Type {attention_type}!') + assert transformer_layers is None or isinstance( + transformer_layers, (dict, list)) + + self.num_frames = num_frames + self.pretrained = pretrained + self.embed_dims = embed_dims + self.num_transformer_layers = num_transformer_layers + self.attention_type = attention_type + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dims=embed_dims) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + 1, embed_dims)) + self.drop_after_pos = nn.Dropout(p=dropout_ratio) + if self.attention_type != 'space_only': + self.time_embed = nn.Parameter( + torch.zeros(1, num_frames, embed_dims)) + self.drop_after_time = nn.Dropout(p=dropout_ratio) + + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + if transformer_layers is None: + # stochastic depth decay rule + dpr = np.linspace(0, 0.1, num_transformer_layers) + + if self.attention_type == 'divided_space_time': + _transformerlayers_cfg = [ + dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='DividedTemporalAttentionWithNorm', + embed_dims=embed_dims, + num_heads=num_heads, + num_frames=num_frames, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)), + dict( + type='DividedSpatialAttentionWithNorm', + embed_dims=embed_dims, + num_heads=num_heads, + 
num_frames=num_frames, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)) + ], + ffn_cfgs=dict( + type='FFNWithNorm', + embed_dims=embed_dims, + feedforward_channels=embed_dims * 4, + num_fcs=2, + act_cfg=dict(type='GELU'), + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i]), + norm_cfg=dict(type='LN', eps=1e-6)), + operation_order=('self_attn', 'self_attn', 'ffn')) + for i in range(num_transformer_layers) + ] + else: + # Sapce Only & Joint Space Time + _transformerlayers_cfg = [ + dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=embed_dims, + num_heads=num_heads, + batch_first=True, + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i])) + ], + ffn_cfgs=dict( + type='FFN', + embed_dims=embed_dims, + feedforward_channels=embed_dims * 4, + num_fcs=2, + act_cfg=dict(type='GELU'), + dropout_layer=dict( + type='DropPath', drop_prob=dpr[i])), + operation_order=('norm', 'self_attn', 'norm', 'ffn'), + norm_cfg=dict(type='LN', eps=1e-6), + batch_first=True) + for i in range(num_transformer_layers) + ] + + transformer_layers = ConfigDict( + dict( + type='TransformerLayerSequence', + transformerlayers=_transformerlayers_cfg, + num_layers=num_transformer_layers)) + + self.transformer_layers = build_transformer_layer_sequence( + transformer_layers) + + def init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + + if pretrained: + self.pretrained = pretrained + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + state_dict = _load_checkpoint(self.pretrained, map_location='cpu') + if 'state_dict' in state_dict: + state_dict = state_dict['state_dict'] + + if self.attention_type == 'divided_space_time': + # modify the key names of norm layers + 
old_state_dict_keys = list(state_dict.keys()) + for old_key in old_state_dict_keys: + if 'norms' in old_key: + new_key = old_key.replace('norms.0', + 'attentions.0.norm') + new_key = new_key.replace('norms.1', 'ffns.0.norm') + state_dict[new_key] = state_dict.pop(old_key) + + # copy the parameters of space attention to time attention + old_state_dict_keys = list(state_dict.keys()) + for old_key in old_state_dict_keys: + if 'attentions.0' in old_key: + new_key = old_key.replace('attentions.0', + 'attentions.1') + state_dict[new_key] = state_dict[old_key].clone() + + load_state_dict(self, state_dict, strict=False, logger=logger) + + def forward(self, x): + """Defines the computation performed at every call.""" + # x [batch_size * num_frames, num_patches, embed_dims] + batches = x.shape[0] + x = self.patch_embed(x) + + # x [batch_size * num_frames, num_patches + 1, embed_dims] + cls_tokens = self.cls_token.expand(x.size(0), -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.drop_after_pos(x) + + # Add Time Embedding + if self.attention_type != 'space_only': + # x [batch_size, num_patches * num_frames + 1, embed_dims] + cls_tokens = x[:batches, 0, :].unsqueeze(1) + x = rearrange(x[:, 1:, :], '(b t) p m -> (b p) t m', b=batches) + x = x + self.time_embed + x = self.drop_after_time(x) + x = rearrange(x, '(b p) t m -> b (p t) m', b=batches) + x = torch.cat((cls_tokens, x), dim=1) + + x = self.transformer_layers(x, None, None) + + if self.attention_type == 'space_only': + # x [batch_size, num_patches + 1, embed_dims] + x = x.view(-1, self.num_frames, *x.size()[-2:]) + x = torch.mean(x, 1) + + x = self.norm(x) + + # Return Class Token + return x[:, 0] diff --git a/mmaction/models/backbones/uniformer.py b/mmaction/models/backbones/uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..78d0cd80c4f072aa3e958161e91f148736a971f5 --- /dev/null +++ b/mmaction/models/backbones/uniformer.py @@ -0,0 +1,669 @@ +# Copyright 
(c) OpenMMLab. All rights reserved. +import os +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.runner.checkpoint import _load_checkpoint +from mmengine.utils import to_2tuple + +from mmaction.registry import MODELS + +logger = MMLogger.get_current_instance() + +MODEL_PATH = 'https://download.openmmlab.com/mmaction/v1.0/recognition' +_MODELS = { + 'uniformer_small_in1k': + os.path.join(MODEL_PATH, + 'uniformerv1/uniformer_small_in1k_20221219-fe0a7ae0.pth'), + 'uniformer_base_in1k': + os.path.join(MODEL_PATH, + 'uniformerv1/uniformer_base_in1k_20221219-82c01015.pth'), +} + + +def conv_3xnxn(inp: int, + oup: int, + kernel_size: int = 3, + stride: int = 3, + groups: int = 1): + """3D convolution with kernel size of 3xnxn. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + kernel_size (int): The spatial kernel size (i.e., n). + Defaults to 3. + stride (int): The spatial stride. + Defaults to 3. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d( + inp, + oup, (3, kernel_size, kernel_size), (2, stride, stride), (1, 0, 0), + groups=groups) + + +def conv_1xnxn(inp: int, + oup: int, + kernel_size: int = 3, + stride: int = 3, + groups: int = 1): + """3D convolution with kernel size of 1xnxn. + + Args: + inp (int): Dimension of input features. + oup (int): Dimension of output features. + kernel_size (int): The spatial kernel size (i.e., n). + Defaults to 3. + stride (int): The spatial stride. + Defaults to 3. + groups (int): Group number of operated features. + Defaults to 1. + """ + return nn.Conv3d( + inp, + oup, (1, kernel_size, kernel_size), (1, stride, stride), (0, 0, 0), + groups=groups) + + +def conv_1x1x1(inp: int, oup: int, groups: int = 1): + """3D convolution with kernel size of 1x1x1. 
def conv_1x1x1(inp: int, oup: int, groups: int = 1):
    """Pointwise 3D convolution (kernel 1x1x1, stride 1, no padding).

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Group number of operated features.
            Defaults to 1.
    """
    return nn.Conv3d(
        inp,
        oup,
        kernel_size=(1, 1, 1),
        stride=(1, 1, 1),
        padding=(0, 0, 0),
        groups=groups)


def conv_3x3x3(inp: int, oup: int, groups: int = 1):
    """3D convolution with a 3x3x3 kernel, stride 1 and 'same' padding.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Group number of operated features.
            Defaults to 1.
    """
    return nn.Conv3d(
        inp,
        oup,
        kernel_size=(3, 3, 3),
        stride=(1, 1, 1),
        padding=(1, 1, 1),
        groups=groups)


def conv_5x5x5(inp: int, oup: int, groups: int = 1):
    """3D convolution with a 5x5x5 kernel, stride 1 and 'same' padding.

    Args:
        inp (int): Dimension of input features.
        oup (int): Dimension of output features.
        groups (int): Group number of operated features.
            Defaults to 1.
    """
    return nn.Conv3d(
        inp,
        oup,
        kernel_size=(5, 5, 5),
        stride=(1, 1, 1),
        padding=(2, 2, 2),
        groups=groups)


def bn_3d(dim):
    """Construct a 3D batch-normalization layer.

    Args:
        dim (int): Dimension of input features.
    """
    return nn.BatchNorm3d(dim)
+ """ + + def __init__( + self, + in_features: int, + hidden_features: int = None, + out_features: int = None, + drop: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(BaseModule): + """Self-Attention. + + Args: + dim (int): Number of input features. + num_heads (int): Number of attention heads. + Defaults to 8. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop (float): Attention dropout rate. + Defaults to 0.0. + proj_drop (float): Dropout rate. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + qk_scale: float = None, + attn_drop: float = 0., + proj_drop: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, + # can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CMlp(BaseModule): + """Multilayer perceptron via convolution. + + Args: + in_features (int): Number of input features. + hidden_features (int): Number of hidden features. + Defaults to None. + out_features (int): Number of output features. + Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. 
+ """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + drop=0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = conv_1x1x1(in_features, hidden_features) + self.act = nn.GELU() + self.fc2 = conv_1x1x1(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CBlock(BaseModule): + """Convolution Block. + + Args: + dim (int): Number of input features. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + drop (float): Dropout rate. + Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + dim: int, + mlp_ratio: float = 4., + drop: float = 0., + drop_path: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pos_embed = conv_3x3x3(dim, dim, groups=dim) + self.norm1 = bn_3d(dim) + self.conv1 = conv_1x1x1(dim, dim, 1) + self.conv2 = conv_1x1x1(dim, dim, 1) + self.attn = conv_5x5x5(dim, dim, groups=dim) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = bn_3d(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp( + in_features=dim, hidden_features=mlp_hidden_dim, drop=drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.pos_embed(x) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SABlock(BaseModule): + """Self-Attention Block. + + Args: + dim (int): Number of input features. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of mlp hidden dimension + to embedding dimension. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop (float): Dropout rate. Defaults to 0.0. + attn_drop (float): Attention dropout rate. Defaults to 0.0. + drop_paths (float): Stochastic depth rates. + Defaults to 0.0. + init_cfg (dict, optional): Config dict for initialization. + Defaults to None. + """ + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4., + qkv_bias: bool = False, + qk_scale: float = None, + drop: float = 0., + attn_drop: float = 0., + drop_path: float = 0., + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pos_embed = conv_3x3x3(dim, dim, groups=dim) + self.norm1 = nn.LayerNorm(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, + # we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
class SpeicalPatchEmbed(BaseModule):
    """Image to Patch Embedding.

    Adds extra temporal downsampling via a temporal kernel size of 3
    (see ``conv_3xnxn``: temporal stride 2), so the output temporal
    length is halved. NOTE: the class name keeps its historical
    misspelling ('Speical') for config/checkpoint compatibility.

    Args:
        img_size (int): Number of input size.
            Defaults to 224.
        patch_size (int): Number of patch size.
            Defaults to 16.
        in_chans (int): Number of input features.
            Defaults to 3.
        embed_dim (int): Number of output features.
            Defaults to 768.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        # Number of spatial patches per frame.
        self.num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0])
        self.norm = nn.LayerNorm(embed_dim)
        self.proj = conv_3xnxn(
            in_chans,
            embed_dim,
            kernel_size=patch_size[0],
            stride=patch_size[0])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project, layer-normalize, and restore the 5D channel-first layout."""
        x = self.proj(x)
        batch, _, frames, height, width = x.shape
        # (B, C, T, H, W) -> (B, T*H*W, C) so LayerNorm acts on channels.
        x = self.norm(x.flatten(2).transpose(1, 2))
        # Back to (B, C, T, H, W).
        x = x.reshape(batch, frames, height, width,
                      -1).permute(0, 4, 1, 2, 3).contiguous()
        return x
class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    Spatial-only patch projection via ``conv_1xnxn``; the temporal
    length is preserved.

    Args:
        img_size (int): Number of input size.
            Defaults to 224.
        patch_size (int): Number of patch size.
            Defaults to 16.
        in_chans (int): Number of input features.
            Defaults to 3.
        embed_dim (int): Number of output features.
            Defaults to 768.
        init_cfg (dict, optional): Config dict for initialization.
            Defaults to None.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        init_cfg: Optional[dict] = None,
    ) -> None:
        super().__init__(init_cfg=init_cfg)

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        # Number of spatial patches per frame.
        self.num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0])
        self.norm = nn.LayerNorm(embed_dim)
        self.proj = conv_1xnxn(
            in_chans,
            embed_dim,
            kernel_size=patch_size[0],
            stride=patch_size[0])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project, layer-normalize, and restore the 5D channel-first layout."""
        x = self.proj(x)
        batch, _, frames, height, width = x.shape
        # (B, C, T, H, W) -> (B, T*H*W, C) so LayerNorm acts on channels.
        x = self.norm(x.flatten(2).transpose(1, 2))
        # Back to (B, C, T, H, W).
        x = x.reshape(batch, frames, height, width,
                      -1).permute(0, 4, 1, 2, 3).contiguous()
        return x
+ Defaults to 0.0. + pretrained2d (bool): Whether to load pretrained from 2D model. + Defaults to True. + pretrained (str): Name of pretrained model. + Defaults to None. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. + """ + + def __init__( + self, + depth: List[int] = [5, 8, 20, 7], + img_size: int = 224, + in_chans: int = 3, + embed_dim: List[int] = [64, 128, 320, 512], + head_dim: int = 64, + mlp_ratio: float = 4., + qkv_bias: bool = True, + qk_scale: float = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + pretrained2d: bool = True, + pretrained: Optional[str] = None, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.patch_embed1 = SpeicalPatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depth)) + ] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = ModuleList([ + CBlock( + dim=embed_dim[0], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[i]) for i in range(depth[0]) + ]) + self.blocks2 = ModuleList([ + CBlock( + 
dim=embed_dim[1], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[i + depth[0]]) for i in range(depth[1]) + ]) + self.blocks3 = ModuleList([ + SABlock( + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1]]) + for i in range(depth[2]) + ]) + self.blocks4 = ModuleList([ + SABlock( + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1] + depth[2]]) + for i in range(depth[3]) + ]) + self.norm = bn_3d(embed_dim[-1]) + + def _inflate_weight(self, + weight_2d: torch.Tensor, + time_dim: int, + center: bool = True) -> torch.Tensor: + logger.info(f'Init center: {center}') + if center: + weight_3d = torch.zeros(*weight_2d.shape) + weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + middle_idx = time_dim // 2 + weight_3d[:, :, middle_idx, :, :] = weight_2d + else: + weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + weight_3d = weight_3d / time_dim + return weight_3d + + def _load_pretrained(self, pretrained: str = None) -> None: + """Load ImageNet-1K pretrained model. + + The model is pretrained with ImageNet-1K. + https://github.com/Sense-X/UniFormer + + Args: + pretrained (str): Model name of ImageNet-1K pretrained model. + Defaults to None. 
+ """ + if pretrained is not None: + model_path = _MODELS[pretrained] + logger.info(f'Load ImageNet pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight( + state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) + + def init_weights(self): + """Initialize the weights in backbone.""" + if self.pretrained2d: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + self._load_pretrained(self.pretrained) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + return x diff --git a/mmaction/models/backbones/uniformerv2.py b/mmaction/models/backbones/uniformerv2.py new file mode 100644 index 0000000000000000000000000000000000000000..4d188da848af855166a64f0074ef77c95014b0b6 --- /dev/null +++ b/mmaction/models/backbones/uniformerv2.py @@ -0,0 +1,597 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
class QuickGELU(BaseModule):
    """Quick GELU activation: ``x * sigmoid(1.702 * x)``.

    Forked from https://github.com/openai/CLIP/blob/d50
    d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the activation element-wise.

        Args:
            x (torch.Tensor): The input features of shape
                :math:`(B, N, C)`.
        """
        gate = torch.sigmoid(1.702 * x)
        return x * gate
+ """ + + def __init__( + self, + d_model: int, + dw_reduction: float = 1.5, + pos_kernel_size: int = 3, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + padding = pos_kernel_size // 2 + re_d_model = int(d_model // dw_reduction) + self.pos_embed = nn.Sequential( + nn.BatchNorm3d(d_model), + nn.Conv3d(d_model, re_d_model, kernel_size=1, stride=1, padding=0), + nn.Conv3d( + re_d_model, + re_d_model, + kernel_size=(pos_kernel_size, 1, 1), + stride=(1, 1, 1), + padding=(padding, 0, 0), + groups=re_d_model), + nn.Conv3d(re_d_model, d_model, kernel_size=1, stride=1, padding=0), + ) + + # init zero + logger.info('Init zero for Conv in pos_emb') + nn.init.constant_(self.pos_embed[3].weight, 0) + nn.init.constant_(self.pos_embed[3].bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.pos_embed(x) + + +class ResidualAttentionBlock(BaseModule): + """Local UniBlock. + + Args: + d_model (int): Number of input channels. + n_head (int): Number of attention head. + drop_path (float): Stochastic depth rate. + Defaults to 0.0. + dw_reduction (float): Downsample ratio of input channels. + Defaults to 1.5. + no_lmhra (bool): Whether removing local MHRA. + Defaults to False. + double_lmhra (bool): Whether using double local MHRA. + Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + d_model: int, + n_head: int, + drop_path: float = 0.0, + dw_reduction: float = 1.5, + no_lmhra: bool = False, + double_lmhra: bool = True, + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.n_head = n_head + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + logger.info(f'Drop path rate: {drop_path}') + + self.no_lmhra = no_lmhra + self.double_lmhra = double_lmhra + logger.info(f'No L_MHRA: {no_lmhra}') + logger.info(f'Double L_MHRA: {double_lmhra}') + if not no_lmhra: + self.lmhra1 = Local_MHRA(d_model, dw_reduction=dw_reduction) + if double_lmhra: + self.lmhra2 = Local_MHRA(d_model, dw_reduction=dw_reduction) + + # spatial + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = nn.LayerNorm(d_model) + + def attention(self, x: torch.Tensor) -> torch.Tensor: + return self.attn(x, x, x, need_weights=False, attn_mask=None)[0] + + def forward(self, x: torch.Tensor, T: int = 8) -> torch.Tensor: + # x: 1+HW, NT, C + if not self.no_lmhra: + # Local MHRA + tmp_x = x[1:, :, :] + L, NT, C = tmp_x.shape + N = NT // T + H = W = int(L**0.5) + tmp_x = tmp_x.view(H, W, N, T, C).permute(2, 4, 3, 0, + 1).contiguous() + tmp_x = tmp_x + self.drop_path(self.lmhra1(tmp_x)) + tmp_x = tmp_x.view(N, C, T, + L).permute(3, 0, 2, + 1).contiguous().view(L, NT, C) + x = torch.cat([x[:1, :, :], tmp_x], dim=0) + # MHSA + x = x + self.drop_path(self.attention(self.ln_1(x))) + # Local MHRA + if not self.no_lmhra and self.double_lmhra: + tmp_x = x[1:, :, :] + tmp_x = tmp_x.view(H, W, N, T, C).permute(2, 4, 3, 0, + 1).contiguous() + tmp_x = tmp_x + self.drop_path(self.lmhra2(tmp_x)) + tmp_x = tmp_x.view(N, C, T, + L).permute(3, 0, 2, + 1).contiguous().view(L, NT, C) + x = torch.cat([x[:1, :, :], tmp_x], dim=0) + # FFN + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Extractor(BaseModule): + """Global UniBlock. + + Args: + d_model (int): Number of input channels. + n_head (int): Number of attention head. + mlp_factor (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. 
def __init__(
    self,
    d_model: int,
    n_head: int,
    mlp_factor: float = 4.0,
    dropout: float = 0.0,
    drop_path: float = 0.0,
    init_cfg: Optional[dict] = None,
) -> None:
    # d_model: channel width of the global block; n_head: attention
    # heads; mlp_factor: hidden/width ratio of the MLP; dropout: MLP
    # dropout rate (class docstring calls it ``drop_out``); drop_path:
    # stochastic depth rate.
    super().__init__(init_cfg=init_cfg)

    self.drop_path = DropPath(
        drop_path) if drop_path > 0. else nn.Identity()
    logger.info(f'Drop path rate: {drop_path}')
    # Cross-attention module; its packed in_proj weights are sliced
    # manually in ``attention`` below.
    self.attn = nn.MultiheadAttention(d_model, n_head)
    self.ln_1 = nn.LayerNorm(d_model)
    d_mlp = round(mlp_factor * d_model)
    self.mlp = nn.Sequential(
        OrderedDict([('c_fc', nn.Linear(d_model, d_mlp)),
                     ('gelu', QuickGELU()),
                     ('dropout', nn.Dropout(dropout)),
                     ('c_proj', nn.Linear(d_mlp, d_model))]))
    self.ln_2 = nn.LayerNorm(d_model)
    self.ln_3 = nn.LayerNorm(d_model)

    # zero init: output projections of attention and MLP start at zero
    # so the block initially behaves like an identity residual branch;
    # input projections get xavier init.
    nn.init.xavier_uniform_(self.attn.in_proj_weight)
    nn.init.constant_(self.attn.out_proj.weight, 0.)
    nn.init.constant_(self.attn.out_proj.bias, 0.)
    nn.init.xavier_uniform_(self.mlp[0].weight)
    nn.init.constant_(self.mlp[-1].weight, 0.)
    nn.init.constant_(self.mlp[-1].bias, 0.)
def attention(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Cross-attention: queries from ``x``, keys/values from ``y``.

    Re-implements ``nn.MultiheadAttention`` by hand because queries and
    keys/values come from different inputs: the packed ``in_proj_weight``
    of ``self.attn`` is stacked as [q; k; v], so it is sliced into thirds
    of size ``d_model``.
    """
    d_model = self.ln_1.weight.size(0)
    # q projection uses the first third of the packed weight/bias.
    q = (x @ self.attn.in_proj_weight[:d_model].T
         ) + self.attn.in_proj_bias[:d_model]

    # k and v projections use the middle and last thirds, applied to y.
    k = (y @ self.attn.in_proj_weight[d_model:-d_model].T
         ) + self.attn.in_proj_bias[d_model:-d_model]
    v = (y @ self.attn.in_proj_weight[-d_model:].T
         ) + self.attn.in_proj_bias[-d_model:]
    Tx, Ty, N = q.size(0), k.size(0), q.size(1)
    # Split heads: (T, N, C) -> (N, num_heads, T, head_dim).
    q = q.view(Tx, N, self.attn.num_heads,
               self.attn.head_dim).permute(1, 2, 0, 3)
    k = k.view(Ty, N, self.attn.num_heads,
               self.attn.head_dim).permute(1, 2, 0, 3)
    v = v.view(Ty, N, self.attn.num_heads,
               self.attn.head_dim).permute(1, 2, 0, 3)
    # Scaled dot-product attention scores.
    aff = (q @ k.transpose(-2, -1) / (self.attn.head_dim**0.5))

    aff = aff.softmax(dim=-1)
    out = aff @ v
    # Merge heads back: (N, H, Tx, head_dim) -> (Tx, N, d_model).
    out = out.permute(2, 0, 1, 3).flatten(2)
    out = self.attn.out_proj(out)
    return out

def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Pre-norm residual cross-attention followed by a pre-norm MLP."""
    x = x + self.drop_path(self.attention(self.ln_1(x), self.ln_3(y)))
    x = x + self.drop_path(self.mlp(self.ln_2(x)))
    return x
+ Defaults to 4. + n_dim (int): Number of input channels in global UniBlock. + Defaults to 768. + n_head (int): Number of attention head in global UniBlock. + Defaults to 12. + mlp_factor (float): Ratio of hidden dimensions in MLP layers + in global UniBlock. Defaults to 4.0. + drop_path_rate (float): Stochastic depth rate in global UniBlock. + Defaults to 0.0. + mlp_dropout (List[float]): Stochastic dropout rate in each MLP layer + in global UniBlock. Defaults to [0.5, 0.5, 0.5, 0.5]. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + width: int, + layers: int, + heads: int, + backbone_drop_path_rate: float = 0., + t_size: int = 8, + dw_reduction: float = 1.5, + no_lmhra: bool = True, + double_lmhra: bool = False, + return_list: List[int] = [8, 9, 10, 11], + n_layers: int = 4, + n_dim: int = 768, + n_head: int = 12, + mlp_factor: float = 4.0, + drop_path_rate: float = 0., + mlp_dropout: List[float] = [0.5, 0.5, 0.5, 0.5], + init_cfg: Optional[dict] = None, + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.T = t_size + self.return_list = return_list + # backbone + b_dpr = [ + x.item() + for x in torch.linspace(0, backbone_drop_path_rate, layers) + ] + self.resblocks = ModuleList([ + ResidualAttentionBlock( + width, + heads, + drop_path=b_dpr[i], + dw_reduction=dw_reduction, + no_lmhra=no_lmhra, + double_lmhra=double_lmhra, + ) for i in range(layers) + ]) + + # global block + assert n_layers == len(return_list) + self.temporal_cls_token = nn.Parameter(torch.zeros(1, 1, n_dim)) + self.dpe = ModuleList([ + nn.Conv3d( + n_dim, + n_dim, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=n_dim) for _ in range(n_layers) + ]) + for m in self.dpe: + nn.init.constant_(m.bias, 0.) 
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, n_layers)] + self.dec = ModuleList([ + Extractor( + n_dim, + n_head, + mlp_factor=mlp_factor, + dropout=mlp_dropout[i], + drop_path=dpr[i], + ) for i in range(n_layers) + ]) + # weight sum + self.norm = nn.LayerNorm(n_dim) + self.balance = nn.Parameter(torch.zeros((n_dim))) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + T_down = self.T + L, NT, C = x.shape + N = NT // T_down + H = W = int((L - 1)**0.5) + cls_token = self.temporal_cls_token.repeat(1, N, 1) + + j = -1 + for i, resblock in enumerate(self.resblocks): + x = resblock(x, T_down) + if i in self.return_list: + j += 1 + tmp_x = x.clone() + tmp_x = tmp_x.view(L, N, T_down, C) + # dpe + _, tmp_feats = tmp_x[:1], tmp_x[1:] + tmp_feats = tmp_feats.permute(1, 3, 2, + 0).reshape(N, C, T_down, H, W) + tmp_feats = self.dpe[j](tmp_feats.clone()).view( + N, C, T_down, L - 1).permute(3, 0, 2, 1).contiguous() + tmp_x[1:] = tmp_x[1:] + tmp_feats + # global block + tmp_x = tmp_x.permute(2, 0, 1, 3).flatten(0, 1) # T * L, N, C + cls_token = self.dec[j](cls_token, tmp_x) + + weight = self.sigmoid(self.balance) + residual = x.view(L, N, T_down, C)[0].mean(1) # L, N, T, C + out = self.norm((1 - weight) * cls_token[0, :, :] + weight * residual) + return out + + +@MODELS.register_module() +class UniFormerV2(BaseModule): + """UniFormerV2: + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + input_resolution (int): Number of input resolution. + Defaults to 224. + patch_size (int): Number of patch size. + Defaults to 16. + width (int): Number of input channels in local UniBlock. + Defaults to 768. + layers (int): Number of layers of local UniBlock. + Defaults to 12. + heads (int): Number of attention head in local UniBlock. + Defaults to 12. + backbone_drop_path_rate (float): Stochastic depth rate + in local UniBlock. Defaults to 0.0. 
+ t_size (int): Number of temporal dimension after patch embedding. + Defaults to 8. + temporal_downsample (bool): Whether downsampling temporal dimentison. + Defaults to False. + dw_reduction (float): Downsample ratio of input channels in local MHRA. + Defaults to 1.5. + no_lmhra (bool): Whether removing local MHRA in local UniBlock. + Defaults to False. + double_lmhra (bool): Whether using double local MHRA in local UniBlock. + Defaults to True. + return_list (List[int]): Layer index of input features + for global UniBlock. Defaults to [8, 9, 10, 11]. + n_dim (int): Number of layers of global UniBlock. + Defaults to 4. + n_dim (int): Number of layers of global UniBlock. + Defaults to 4. + n_dim (int): Number of input channels in global UniBlock. + Defaults to 768. + n_head (int): Number of attention head in global UniBlock. + Defaults to 12. + mlp_factor (float): Ratio of hidden dimensions in MLP layers + in global UniBlock. Defaults to 4.0. + drop_path_rate (float): Stochastic depth rate in global UniBlock. + Defaults to 0.0. + mlp_dropout (List[float]): Stochastic dropout rate in each MLP layer + in global UniBlock. Defaults to [0.5, 0.5, 0.5, 0.5]. + clip_pretrained (bool): Whether to load pretrained CLIP visual encoder. + Defaults to True. + pretrained (str): Name of pretrained model. + Defaults to None. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. 
+ """ + + def __init__( + self, + # backbone + input_resolution: int = 224, + patch_size: int = 16, + width: int = 768, + layers: int = 12, + heads: int = 12, + backbone_drop_path_rate: float = 0., + t_size: int = 8, + kernel_size: int = 3, + dw_reduction: float = 1.5, + temporal_downsample: bool = False, + no_lmhra: bool = True, + double_lmhra: bool = False, + # global block + return_list: List[int] = [8, 9, 10, 11], + n_layers: int = 4, + n_dim: int = 768, + n_head: int = 12, + mlp_factor: float = 4.0, + drop_path_rate: float = 0., + mlp_dropout: List[float] = [0.5, 0.5, 0.5, 0.5], + # pretrain + clip_pretrained: bool = True, + pretrained: Optional[str] = None, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + + self.pretrained = pretrained + self.clip_pretrained = clip_pretrained + self.input_resolution = input_resolution + padding = (kernel_size - 1) // 2 + if temporal_downsample: + self.conv1 = nn.Conv3d( + 3, + width, (kernel_size, patch_size, patch_size), + (2, patch_size, patch_size), (padding, 0, 0), + bias=False) + t_size = t_size // 2 + else: + self.conv1 = nn.Conv3d( + 3, + width, (1, patch_size, patch_size), + (1, patch_size, patch_size), (0, 0, 0), + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = nn.LayerNorm(width) + + self.transformer = Transformer( + width, + layers, + heads, + dw_reduction=dw_reduction, + backbone_drop_path_rate=backbone_drop_path_rate, + t_size=t_size, + no_lmhra=no_lmhra, + double_lmhra=double_lmhra, + return_list=return_list, + n_layers=n_layers, + n_dim=n_dim, + n_head=n_head, + mlp_factor=mlp_factor, + drop_path_rate=drop_path_rate, + mlp_dropout=mlp_dropout, + ) + + 
def _inflate_weight(self, + weight_2d: torch.Tensor, + time_dim: int, + center: bool = True) -> torch.Tensor: + logger.info(f'Init center: {center}') + if center: + weight_3d = torch.zeros(*weight_2d.shape) + weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + middle_idx = time_dim // 2 + weight_3d[:, :, middle_idx, :, :] = weight_2d + else: + weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1) + weight_3d = weight_3d / time_dim + return weight_3d + + def _load_pretrained(self, pretrained: str = None) -> None: + """Load CLIP pretrained visual encoder. + + The visual encoder is extracted from CLIP. + https://github.com/openai/CLIP + + Args: + pretrained (str): Model name of pretrained CLIP visual encoder. + Defaults to None. + """ + assert pretrained is not None, \ + 'please specify clip pretraied checkpoint' + + model_path = _MODELS[pretrained] + logger.info(f'Load CLIP pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight(state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) + + def init_weights(self): + """Initialize the weights in backbone.""" + if self.clip_pretrained: + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + self._load_pretrained(self.pretrained) + else: + if self.pretrained: + self.init_cfg = dict( + type='Pretrained', checkpoint=self.pretrained) + super().init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv1(x) # shape = [*, width, grid, grid] + N, C, T, H, W = x.shape + x = 
x.permute(0, 2, 3, 4, 1).reshape(N * T, H * W, C) + + x = torch.cat([ + self.class_embedding.to(x.dtype) + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x + ], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + out = self.transformer(x) + return out diff --git a/mmaction/models/backbones/vit_mae.py b/mmaction/models/backbones/vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..03111d61ce3ed57f0d3806b5fab9b1fa794b5a90 --- /dev/null +++ b/mmaction/models/backbones/vit_mae.py @@ -0,0 +1,383 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import torch +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmcv.cnn.bricks.transformer import FFN, PatchEmbed +from mmengine.model import BaseModule, ModuleList +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, OptConfigType + + +class Attention(BaseModule): + """Multi-head Self-attention. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads. + qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_rate (float): Dropout ratio of output. Defaults to 0. + init_cfg (dict or ConfigDict, optional): The Config + for initialization. Defaults to None. 
+ """ + + def __init__(self, + embed_dims: int, + num_heads: int = 8, + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + attn_drop_rate: float = 0., + drop_rate: float = 0., + init_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + + self.scale = qk_scale or head_embed_dims**-0.5 + + if qkv_bias: + self._init_qv_bias() + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(drop_rate) + + def _init_qv_bias(self) -> None: + self.q_bias = nn.Parameter(torch.zeros(self.embed_dims)) + self.v_bias = nn.Parameter(torch.zeros(self.embed_dims)) + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data with size of (B, N, C). + Returns: + Tensor: The output of the attention block, same size as inputs. + """ + B, N, C = x.shape + + if hasattr(self, 'q_bias'): + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) + qkv_bias = torch.cat((self.q_bias, k_bias, self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + else: + qkv = self.qkv(x) + + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(BaseModule): + """The basic block in the Vision Transformer. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): The ratio between the hidden layer and the + input layer in the FFN. Defaults to 4. 
+ qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout ratio of output. Defaults to 0. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_path_rate (float): Dropout ratio of the residual branch. + Defaults to 0. + init_values (float): Value to init the multiplier of the + residual branch. Defaults to 0. + act_cfg (dict or ConfigDict): Config for activation layer in FFN. + Defaults to `dict(type='GELU')`. + norm_cfg (dict or ConfigDict): Config for norm layers. + Defaults to `dict(type='LN', eps=1e-6)`. + init_cfg (dict or ConfigDict, optional): The Config + for initialization. Defaults to None. + """ + + def __init__(self, + embed_dims: int, + num_heads: int, + mlp_ratio: int = 4., + qkv_bias: bool = True, + qk_scale: Optional[float] = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + init_values: float = 0.0, + act_cfg: ConfigType = dict(type='GELU'), + norm_cfg: ConfigType = dict(type='LN', eps=1e-6), + init_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = Attention( + embed_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate) + + self.drop_path = nn.Identity() + if drop_path_rate > 0.: + self.drop_path = DropPath(drop_path_rate) + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + mlp_hidden_dim = int(embed_dims * mlp_ratio) + self.mlp = FFN( + embed_dims=embed_dims, + feedforward_channels=mlp_hidden_dim, + act_cfg=act_cfg, + ffn_drop=drop_rate, + add_identity=False) + + self._init_gammas(init_values, embed_dims) + + def _init_gammas(self, init_values: float, dim: int) -> None: + if type(init_values) == float and init_values > 0: + self.gamma_1 = 
nn.Parameter( + init_values * torch.ones(dim), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones(dim), requires_grad=True) + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data with size of (B, N, C). + Returns: + Tensor: The output of the transformer block, same size as inputs. + """ + if hasattr(self, 'gamma_1'): + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +def get_sinusoid_encoding(n_position: int, embed_dims: int) -> Tensor: + """Generate sinusoid encoding table. + + Sinusoid encoding is a kind of relative position encoding method came from + `Attention Is All You Need`_. + Args: + n_position (int): The length of the input token. + embed_dims (int): The position embedding dimension. + Returns: + :obj:`torch.FloatTensor`: The sinusoid encoding table of size + (1, n_position, embed_dims) + """ + + vec = torch.arange(embed_dims, dtype=torch.float64) + vec = (vec - vec % 2) / embed_dims + vec = torch.pow(10000, -vec).view(1, -1) + + sinusoid_table = torch.arange(n_position).view(-1, 1) * vec + sinusoid_table[:, 0::2].sin_() # dim 2i + sinusoid_table[:, 1::2].cos_() # dim 2i+1 + + sinusoid_table = sinusoid_table.to(torch.float32) + + return sinusoid_table.unsqueeze(0) + + +@MODELS.register_module() +class VisionTransformer(BaseModule): + """Vision Transformer with support for patch or hybrid CNN input stage. An + impl of `VideoMAE: Masked Autoencoders are Data-Efficient Learners for + Self-Supervised Video Pre-Training `_ + + Args: + img_size (int or tuple): Size of input image. + Defaults to 224. + patch_size (int): Spatial size of one patch. Defaults to 16. + in_channels (int): The number of channels of he input. + Defaults to 3. 
+ embed_dims (int): Dimensions of embedding. Defaults to 768. + depth (int): number of blocks in the transformer. + Defaults to 12. + num_heads (int): Number of parallel attention heads in + TransformerCoder. Defaults to 12. + mlp_ratio (int): The ratio between the hidden layer and the + input layer in the FFN. Defaults to 4. + qkv_bias (bool): If True, add a learnable bias to q and v. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + drop_rate (float): Dropout ratio of output. Defaults to 0. + attn_drop_rate (float): Dropout ratio of attention weight. + Defaults to 0. + drop_path_rate (float): Dropout ratio of the residual branch. + Defaults to 0. + norm_cfg (dict or Configdict): Config for norm layers. + Defaults to `dict(type='LN', eps=1e-6)`. + init_values (float): Value to init the multiplier of the residual + branch. Defaults to 0. + use_learnable_pos_emb (bool): If True, use learnable positional + embedding, othersize use sinusoid encoding. Defaults to False. + num_frames (int): Number of frames in the video. Defaults to 16. + tubelet_size (int): Temporal size of one patch. Defaults to 2. + use_mean_pooling (bool): If True, take the mean pooling over all + positions. Defaults to True. + pretrained (str, optional): Name of pretrained model. Default: None. + return_feat_map (bool): If True, return the feature in the shape of + `[B, C, T, H, W]`. Defaults to False. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ]``. 
+ """ + + def __init__(self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dims: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: int = 4., + qkv_bias: bool = True, + qk_scale: int = None, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + norm_cfg: ConfigType = dict(type='LN', eps=1e-6), + init_values: int = 0., + use_learnable_pos_emb: bool = False, + num_frames: int = 16, + tubelet_size: int = 2, + use_mean_pooling: int = True, + pretrained: Optional[str] = None, + return_feat_map: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict( + type='TruncNormal', layer='Linear', std=0.02, + bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + **kwargs) -> None: + + if pretrained: + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.patch_size = patch_size + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv3d', + kernel_size=(tubelet_size, patch_size, patch_size), + stride=(tubelet_size, patch_size, patch_size), + padding=(0, 0, 0), + dilation=(1, 1, 1)) + + grid_size = img_size // patch_size + num_patches = grid_size**2 * (num_frames // tubelet_size) + self.grid_size = (grid_size, grid_size) + + if use_learnable_pos_emb: + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches, embed_dims)) + nn.init.trunc_normal_(self.pos_embed, std=.02) + else: + # sine-cosine positional embeddings is on the way + pos_embed = get_sinusoid_encoding(num_patches, embed_dims) + self.register_buffer('pos_embed', pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + self.blocks = ModuleList([ + Block( + embed_dims=embed_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, 
+ drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i], + norm_cfg=norm_cfg, + init_values=init_values) for i in range(depth) + ]) + + if use_mean_pooling: + self.norm = nn.Identity() + self.fc_norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + self.fc_norm = None + + self.return_feat_map = return_feat_map + + def forward(self, x: Tensor) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + Returns: + Tensor: The feature of the input + samples extracted by the backbone. + """ + b, _, _, h, w = x.shape + h //= self.patch_size + w //= self.patch_size + x = self.patch_embed(x)[0] + if (h, w) != self.grid_size: + pos_embed = self.pos_embed.reshape(-1, *self.grid_size, + self.embed_dims) + pos_embed = pos_embed.permute(0, 3, 1, 2) + pos_embed = F.interpolate( + pos_embed, size=(h, w), mode='bicubic', align_corners=False) + pos_embed = pos_embed.permute(0, 2, 3, 1).flatten(1, 2) + pos_embed = pos_embed.reshape(1, -1, self.embed_dims) + else: + pos_embed = self.pos_embed + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + if self.return_feat_map: + x = x.reshape(b, -1, h, w, self.embed_dims) + x = x.permute(0, 4, 1, 2, 3) + return x + + if self.fc_norm is not None: + return self.fc_norm(x.mean(1)) + + return x[:, 0] diff --git a/mmaction/models/backbones/x3d.py b/mmaction/models/backbones/x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..f9cb0aa0c693ae4c07018f879765dec0df29ce8a --- /dev/null +++ b/mmaction/models/backbones/x3d.py @@ -0,0 +1,533 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import math

import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import ConvModule, Swish, build_activation_layer
from mmengine.logging import MMLogger
from mmengine.model.weight_init import constant_init, kaiming_init
from mmengine.runner import load_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm

from mmaction.registry import MODELS


class SEModule(nn.Module):
    """Squeeze-and-Excitation module over 3D feature maps.

    Args:
        channels (int): Number of input (and output) channels.
        reduction (float): Width multiplier used to compute the bottleneck
            width via ``_round_width`` (e.g. ``1 / 16``).
    """

    def __init__(self, channels, reduction):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool3d(1)
        self.bottleneck = self._round_width(channels, reduction)
        self.fc1 = nn.Conv3d(
            channels, self.bottleneck, kernel_size=1, padding=0)
        self.relu = nn.ReLU()
        self.fc2 = nn.Conv3d(
            self.bottleneck, channels, kernel_size=1, padding=0)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _round_width(width, multiplier, min_width=8, divisor=8):
        """Round width of filters based on width multiplier."""
        width *= multiplier
        min_width = min_width or divisor
        # Round to the nearest multiple of ``divisor``, never below
        # ``min_width``; bump up one divisor if rounding lost >10%.
        width_out = max(min_width,
                        int(width + divisor / 2) // divisor * divisor)
        if width_out < 0.9 * width:
            width_out += divisor
        return int(width_out)

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input data.

        Returns:
            Tensor: The output of the module.
        """
        module_input = x
        x = self.avg_pool(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        # Channel-wise re-weighting of the input by the squeeze gate.
        return module_input * x


class BlockX3D(nn.Module):
    """BlockX3D 3d building block for X3D.

    Args:
        inplanes (int): Number of channels for the input in first conv3d layer.
        planes (int): Number of channels produced by some norm/conv3d layers.
        outplanes (int): Number of channels produced by final the conv3d layer.
        spatial_stride (int): Spatial stride in the conv3d layer. Default: 1.
        downsample (nn.Module | None): Downsample layer. Default: None.
        se_ratio (float | None): The reduction ratio of squeeze and excitation
            unit.
            If set as None, it means not using SE unit. Default: None.
        use_swish (bool): Whether to use swish as the activation function
            before and after the 3x3x3 conv. Default: True.
        conv_cfg (dict): Config dict for convolution layer.
            Default: ``dict(type='Conv3d')``.
        norm_cfg (dict): Config for norm layers. required keys are ``type``,
            Default: ``dict(type='BN3d')``.
        act_cfg (dict): Config dict for activation layer.
            Default: ``dict(type='ReLU')``.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
    """

    def __init__(self,
                 inplanes,
                 planes,
                 outplanes,
                 spatial_stride=1,
                 downsample=None,
                 se_ratio=None,
                 use_swish=True,
                 conv_cfg=dict(type='Conv3d'),
                 norm_cfg=dict(type='BN3d'),
                 act_cfg=dict(type='ReLU'),
                 with_cp=False):
        super().__init__()

        self.inplanes = inplanes
        self.planes = planes
        self.outplanes = outplanes
        self.spatial_stride = spatial_stride
        self.downsample = downsample
        self.se_ratio = se_ratio
        self.use_swish = use_swish
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.act_cfg_swish = dict(type='Swish')
        self.with_cp = with_cp

        # 1x1x1 pointwise conv: expand to the bottleneck width.
        self.conv1 = ConvModule(
            in_channels=inplanes,
            out_channels=planes,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        # Here we use the channel-wise conv
        self.conv2 = ConvModule(
            in_channels=planes,
            out_channels=planes,
            kernel_size=3,
            stride=(1, self.spatial_stride, self.spatial_stride),
            padding=1,
            groups=planes,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=None)

        self.swish = Swish()

        # 1x1x1 pointwise conv back to the output width; no activation here
        # (the residual sum is activated in ``forward``).
        self.conv3 = ConvModule(
            in_channels=planes,
            out_channels=outplanes,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=None)

        if self.se_ratio is not None:
            self.se_module = SEModule(planes, self.se_ratio)

        self.relu = build_activation_layer(self.act_cfg)

    def forward(self, x):
        """Defines the computation performed at every call."""

        def _inner_forward(x):
            """Forward wrapper for utilizing checkpoint."""
            identity = x

            out = self.conv1(x)
            out = self.conv2(out)
            if self.se_ratio is not None:
                out = self.se_module(out)

            out = self.swish(out)

            out = self.conv3(out)

            if self.downsample is not None:
                identity = self.downsample(x)

            out = out + identity
            return out

        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        out = self.relu(out)
        return out


# We do not support initialize with 2D pretrain weight for X3D
@MODELS.register_module()
class X3D(nn.Module):
    """X3D backbone. https://arxiv.org/pdf/2004.04730.pdf.

    Args:
        gamma_w (float): Global channel width expansion factor. Default: 1.
        gamma_b (float): Bottleneck channel width expansion factor. Default: 1.
        gamma_d (float): Network depth expansion factor. Default: 1.
        pretrained (str | None): Name of pretrained model. Default: None.
        in_channels (int): Channel num of input features. Default: 3.
        num_stages (int): Resnet stages. Default: 4.
        spatial_strides (Sequence[int]):
            Spatial strides of residual blocks of each stage.
            Default: ``(2, 2, 2, 2)``.
        frozen_stages (int): Stages to be frozen (all param fixed). If set to
            -1, it means not freezing any parameters. Default: -1.
        se_style (str): The style of inserting SE modules into BlockX3D, 'half'
            denotes insert into half of the blocks, while 'all' denotes insert
            into all blocks. Default: 'half'.
        se_ratio (float | None): The reduction ratio of squeeze and excitation
            unit. If set as None, it means not using SE unit. Default: 1 / 16.
        use_swish (bool): Whether to use swish as the activation function
            before and after the 3x3x3 conv. Default: True.
        conv_cfg (dict): Config for conv layers.
required keys are ``type`` + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. + Default: ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Default: True. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + def __init__(self, + gamma_w=1.0, + gamma_b=1.0, + gamma_d=1.0, + pretrained=None, + in_channels=3, + num_stages=4, + spatial_strides=(2, 2, 2, 2), + frozen_stages=-1, + se_style='half', + se_ratio=1 / 16, + use_swish=True, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True, + **kwargs): + super().__init__() + self.gamma_w = gamma_w + self.gamma_b = gamma_b + self.gamma_d = gamma_d + + self.pretrained = pretrained + self.in_channels = in_channels + # Hard coded, can be changed by gamma_w + self.base_channels = 24 + self.stage_blocks = [1, 2, 5, 3] + + # apply parameters gamma_w and gamma_d + self.base_channels = self._round_width(self.base_channels, + self.gamma_w) + + self.stage_blocks = [ + self._round_repeats(x, self.gamma_d) for x in self.stage_blocks + ] + + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.spatial_strides = spatial_strides + assert len(spatial_strides) == num_stages + self.frozen_stages = frozen_stages + + self.se_style = se_style + assert self.se_style in ['all', 'half'] + self.se_ratio = se_ratio + assert (self.se_ratio is None) or (self.se_ratio > 0) + 
self.use_swish = use_swish + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block = BlockX3D + self.stage_blocks = self.stage_blocks[:num_stages] + self.layer_inplanes = self.base_channels + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = spatial_strides[i] + inplanes = self.base_channels * 2**i + planes = int(inplanes * self.gamma_b) + + res_layer = self.make_res_layer( + self.block, + self.layer_inplanes, + inplanes, + planes, + num_blocks, + spatial_stride=spatial_stride, + se_style=self.se_style, + se_ratio=self.se_ratio, + use_swish=self.use_swish, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + with_cp=with_cp, + **kwargs) + self.layer_inplanes = inplanes + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.base_channels * 2**(len(self.stage_blocks) - 1) + self.conv5 = ConvModule( + self.feat_dim, + int(self.feat_dim * self.gamma_b), + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.feat_dim = int(self.feat_dim * self.gamma_b) + + @staticmethod + def _round_width(width, multiplier, min_depth=8, divisor=8): + """Round width of filters based on width multiplier.""" + if not multiplier: + return width + + width *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(width + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * width: + new_filters += divisor + return int(new_filters) + + @staticmethod + def _round_repeats(repeats, multiplier): + """Round number of layers based on depth multiplier.""" + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + # the module is parameterized 
with gamma_b + # no temporal_stride + def make_res_layer(self, + block, + layer_inplanes, + inplanes, + planes, + blocks, + spatial_stride=1, + se_style='half', + se_ratio=None, + use_swish=True, + norm_cfg=None, + act_cfg=None, + conv_cfg=None, + with_cp=False, + **kwargs): + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + layer_inplanes (int): Number of channels for the input feature + of the res layer. + inplanes (int): Number of channels for the input feature in each + block, which equals to base_channels * gamma_w. + planes (int): Number of channels for the output feature in each + block, which equals to base_channel * gamma_w * gamma_b. + blocks (int): Number of residual blocks. + spatial_stride (int): Spatial strides in residual and conv layers. + Default: 1. + se_style (str): The style of inserting SE modules into BlockX3D, + 'half' denotes insert into half of the blocks, while 'all' + denotes insert into all blocks. Default: 'half'. + se_ratio (float | None): The reduction ratio of squeeze and + excitation unit. If set as None, it means not using SE unit. + Default: None. + use_swish (bool): Whether to use swish as the activation function + before and after the 3x3x3 conv. Default: True. + conv_cfg (dict | None): Config for norm layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool | None): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. 
+ """ + downsample = None + if spatial_stride != 1 or layer_inplanes != inplanes: + downsample = ConvModule( + layer_inplanes, + inplanes, + kernel_size=1, + stride=(1, spatial_stride, spatial_stride), + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + use_se = [False] * blocks + if self.se_style == 'all': + use_se = [True] * blocks + elif self.se_style == 'half': + use_se = [i % 2 == 0 for i in range(blocks)] + else: + raise NotImplementedError + + layers = [] + layers.append( + block( + layer_inplanes, + planes, + inplanes, + spatial_stride=spatial_stride, + downsample=downsample, + se_ratio=se_ratio if use_se[0] else None, + use_swish=use_swish, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + inplanes, + spatial_stride=1, + se_ratio=se_ratio if use_se[i] else None, + use_swish=use_swish, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return nn.Sequential(*layers) + + def _make_stem_layer(self): + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + self.conv1_s = ConvModule( + self.in_channels, + self.base_channels, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=None, + act_cfg=None) + self.conv1_t = ConvModule( + self.base_channels, + self.base_channels, + kernel_size=(5, 1, 1), + stride=(1, 1, 1), + padding=(2, 0, 0), + groups=self.base_channels, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _freeze_stages(self): + """Prevent all the parameters from being optimized before + ``self.frozen_stages``.""" + if self.frozen_stages >= 0: + self.conv1_s.eval() + self.conv1_t.eval() + for param in self.conv1_s.parameters(): + param.requires_grad = False + for param in self.conv1_t.parameters(): + 
param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, BlockX3D): + constant_init(m.conv3.bn, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. + """ + x = self.conv1_s(x) + x = self.conv1_t(x) + for layer_name in self.res_layers: + res_layer = getattr(self, layer_name) + x = res_layer(x) + x = self.conv5(x) + return x + + def train(self, mode=True): + """Set the optimization status when training.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmaction/models/common/__init__.py b/mmaction/models/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b91cb4f537d0a2daefccaf015ce9c0b606c38f1 --- /dev/null +++ b/mmaction/models/common/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .conv2plus1d import Conv2plus1d +from .conv_audio import ConvAudio +from .sub_batchnorm3d import SubBatchNorm3D +from .tam import TAM +from .transformer import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, FFNWithNorm) + +__all__ = [ + 'Conv2plus1d', 'TAM', 'DividedSpatialAttentionWithNorm', + 'DividedTemporalAttentionWithNorm', 'FFNWithNorm', 'SubBatchNorm3D', + 'ConvAudio' +] diff --git a/mmaction/models/common/__pycache__/__init__.cpython-312.pyc b/mmaction/models/common/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f80b1c2d91b603e51fbaae1c650e792fac798e2b Binary files /dev/null and b/mmaction/models/common/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/common/__pycache__/conv2plus1d.cpython-312.pyc b/mmaction/models/common/__pycache__/conv2plus1d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d312346b7b85e24665d5d1499b2427fd4f39a7b Binary files /dev/null and b/mmaction/models/common/__pycache__/conv2plus1d.cpython-312.pyc differ diff --git a/mmaction/models/common/__pycache__/conv_audio.cpython-312.pyc b/mmaction/models/common/__pycache__/conv_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53c25396255bd4878ae7eacae12aad409abeff23 Binary files /dev/null and b/mmaction/models/common/__pycache__/conv_audio.cpython-312.pyc differ diff --git a/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-312.pyc b/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d89906da256a8f7c2896f9acebfa763ac8b35eeb Binary files /dev/null and b/mmaction/models/common/__pycache__/sub_batchnorm3d.cpython-312.pyc differ diff --git a/mmaction/models/common/__pycache__/tam.cpython-312.pyc b/mmaction/models/common/__pycache__/tam.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8e5d7867b95311cd871372a1f70a3deb1a9373c3 Binary files /dev/null and b/mmaction/models/common/__pycache__/tam.cpython-312.pyc differ diff --git a/mmaction/models/common/__pycache__/transformer.cpython-312.pyc b/mmaction/models/common/__pycache__/transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..251c02d8dcb4aa95f998619d2d43707cd4e182f9 Binary files /dev/null and b/mmaction/models/common/__pycache__/transformer.cpython-312.pyc differ diff --git a/mmaction/models/common/conv2plus1d.py b/mmaction/models/common/conv2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..67e481a90ada086b8b5711ff4407421e1b6cb07b --- /dev/null +++ b/mmaction/models/common/conv2plus1d.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmengine.model.weight_init import constant_init, kaiming_init +from torch.nn.modules.utils import _triple + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType + + +@MODELS.register_module() +class Conv2plus1d(nn.Module): + """(2+1)d Conv module for R(2+1)d backbone. + + https://arxiv.org/pdf/1711.11248.pdf. + + Args: + in_channels (int): Same as ``nn.Conv3d``. + out_channels (int): Same as ``nn.Conv3d``. + kernel_size (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. + stride (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. Defaults to 1. + padding (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. Defaults to 0. + dilation (Union[int, Tuple[int]]): Same as ``nn.Conv3d``. + Defaults to 1. + groups (int): Same as ``nn.Conv3d``. Defaults to 1. + bias (Union[bool, str]): If specified as `auto`, it will be decided by + the norm_cfg. Bias will be set as True if norm_cfg is None, + otherwise False. + norm_cfg (Union[dict, ConfigDict]): Config for norm layers. + Defaults to ``dict(type='BN3d')``. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 0, + dilation: Union[int, Tuple[int]] = 1, + groups: int = 1, + bias: Union[bool, str] = True, + norm_cfg: ConfigType = dict(type='BN3d') + ) -> None: + super().__init__() + + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + assert len(kernel_size) == len(stride) == len(padding) == 3 + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.bias = bias + self.norm_cfg = norm_cfg + self.output_padding = (0, 0, 0) + self.transposed = False + + # The middle-plane is calculated according to: + # M_i = \floor{\frac{t * d^2 N_i-1 * N_i} + # {d^2 * N_i-1 + t * N_i}} + # where d, t are spatial and temporal kernel, and + # N_i, N_i-1 are planes + # and inplanes. https://arxiv.org/pdf/1711.11248.pdf + mid_channels = 3 * ( + in_channels * out_channels * kernel_size[1] * kernel_size[2]) + mid_channels /= ( + in_channels * kernel_size[1] * kernel_size[2] + 3 * out_channels) + mid_channels = int(mid_channels) + + self.conv_s = nn.Conv3d( + in_channels, + mid_channels, + kernel_size=(1, kernel_size[1], kernel_size[2]), + stride=(1, stride[1], stride[2]), + padding=(0, padding[1], padding[2]), + bias=bias) + _, self.bn_s = build_norm_layer(self.norm_cfg, mid_channels) + self.relu = nn.ReLU(inplace=True) + self.conv_t = nn.Conv3d( + mid_channels, + out_channels, + kernel_size=(kernel_size[0], 1, 1), + stride=(stride[0], 1, 1), + padding=(padding[0], 0, 0), + bias=bias) + + self.init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
+ """ + x = self.conv_s(x) + x = self.bn_s(x) + x = self.relu(x) + x = self.conv_t(x) + return x + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + kaiming_init(self.conv_s) + kaiming_init(self.conv_t) + constant_init(self.bn_s, 1, bias=0) diff --git a/mmaction/models/common/conv_audio.py b/mmaction/models/common/conv_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..c53aad1d0907e90d3b73fa9f77c840b68a9d8d75 --- /dev/null +++ b/mmaction/models/common/conv_audio.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init, kaiming_init +from torch.nn.modules.utils import _pair + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class ConvAudio(nn.Module): + """Conv2d module for AudioResNet backbone. + + `_. + + Args: + in_channels (int): Same as ``nn.Conv2d``. + out_channels (int): Same as ``nn.Conv2d``. + kernel_size (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. + op (str): Operation to merge the output of freq + and time feature map. Choices are ``sum`` and ``concat``. + Defaults to ``concat``. + stride (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. Defaults to 1. + padding (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. Defaults to 0. + dilation (Union[int, Tuple[int]]): Same as ``nn.Conv2d``. + Defaults to 1. + groups (int): Same as ``nn.Conv2d``. Defaults to 1. + bias (Union[bool, str]): If specified as ``auto``, it will be decided + by the ``norm_cfg``. Bias will be set as True if ``norm_cfg`` + is None, otherwise False. Defaults to False. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]], + op: str = 'concat', + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 0, + dilation: Union[int, Tuple[int]] = 1, + groups: int = 1, + bias: Union[bool, str] = False) -> None: + super().__init__() + + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + assert op in ['concat', 'sum'] + self.op = op + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.bias = bias + self.output_padding = (0, 0) + self.transposed = False + + self.conv_1 = ConvModule( + in_channels, + out_channels, + kernel_size=(kernel_size[0], 1), + stride=stride, + padding=(kernel_size[0] // 2, 0), + bias=bias, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')) + + self.conv_2 = ConvModule( + in_channels, + out_channels, + kernel_size=(1, kernel_size[1]), + stride=stride, + padding=(0, kernel_size[1] // 2), + bias=bias, + conv_cfg=dict(type='Conv'), + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')) + + self.init_weights() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
+ """ + x_1 = self.conv_1(x) + x_2 = self.conv_2(x) + if self.op == 'concat': + out = torch.cat([x_1, x_2], 1) + else: + out = x_1 + x_2 + return out + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + kaiming_init(self.conv_1.conv) + kaiming_init(self.conv_2.conv) + constant_init(self.conv_1.bn, 1, bias=0) + constant_init(self.conv_2.bn, 1, bias=0) diff --git a/mmaction/models/common/sub_batchnorm3d.py b/mmaction/models/common/sub_batchnorm3d.py new file mode 100644 index 0000000000000000000000000000000000000000..60c7e80d60cb051e9941913f2ef7f448eec8c9a3 --- /dev/null +++ b/mmaction/models/common/sub_batchnorm3d.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy + +import torch +import torch.nn as nn + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class SubBatchNorm3D(nn.Module): + """Sub BatchNorm3d splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently). During evaluation, it aggregates + the stats from all splits into one BN. + + Args: + num_features (int): Dimensions of BatchNorm. 
+ """ + + def __init__(self, num_features, **cfg): + super(SubBatchNorm3D, self).__init__() + + self.num_features = num_features + self.cfg_ = deepcopy(cfg) + self.num_splits = self.cfg_.pop('num_splits', 1) + self.num_features_split = self.num_features * self.num_splits + # only keep one set of affine params, not in .bn or .split_bn + self.cfg_['affine'] = False + self.bn = nn.BatchNorm3d(num_features, **self.cfg_) + self.split_bn = nn.BatchNorm3d(self.num_features_split, **self.cfg_) + self.init_weights(cfg) + + def init_weights(self, cfg): + """Initialize weights.""" + if cfg.get('affine', True): + self.weight = torch.nn.Parameter(torch.ones(self.num_features)) + self.bias = torch.nn.Parameter(torch.zeros(self.num_features)) + self.affine = True + else: + self.affine = False + + def _get_aggregated_mean_std(self, means, stds, n): + """Calculate aggregated mean and std.""" + mean = means.view(n, -1).sum(0) / n + std = stds.view(n, -1).sum(0) / n + ( + (means.view(n, -1) - mean)**2).view(n, -1).sum(0) / n + return mean.detach(), std.detach() + + def aggregate_stats(self): + """Synchronize running_mean, and running_var to self.bn. + + Call this before eval, then call model.eval(); When eval, forward + function will call self.bn instead of self.split_bn, During this time + the running_mean, and running_var of self.bn has been obtained from + self.split_bn. 
+ """ + if self.split_bn.track_running_stats: + aggre_func = self._get_aggregated_mean_std + self.bn.running_mean.data, self.bn.running_var.data = aggre_func( + self.split_bn.running_mean, self.split_bn.running_var, + self.num_splits) + self.bn.num_batches_tracked = self.split_bn.num_batches_tracked.detach( + ) + + def forward(self, x): + """Defines the computation performed at every call.""" + if self.training: + n, c, t, h, w = x.shape + assert n % self.num_splits == 0 + x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) + x = self.split_bn(x) + x = x.view(n, c, t, h, w) + else: + x = self.bn(x) + if self.affine: + x = x * self.weight.view(-1, 1, 1, 1) + x = x + self.bias.view(-1, 1, 1, 1) + return x diff --git a/mmaction/models/common/tam.py b/mmaction/models/common/tam.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce4bbedb29f1a614c3668de46f32aced44f9496 --- /dev/null +++ b/mmaction/models/common/tam.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class TAM(nn.Module): + """Temporal Adaptive Module(TAM) for TANet. + + This module is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO + RECOGNITION `_ + + Args: + in_channels (int): Channel num of input features. + num_segments (int): Number of frame segments. + alpha (int): ``alpha`` in the paper and is the ratio of the + intermediate channel number to the initial channel number in the + global branch. Defaults to 2. + adaptive_kernel_size (int): ``K`` in the paper and is the size of the + adaptive kernel size in the global branch. Defaults to 3. + beta (int): ``beta`` in the paper and is set to control the model + complexity in the local branch. Defaults to 4. + conv1d_kernel_size (int): Size of the convolution kernel of Conv1d in + the local branch. Defaults to 3. 
+ adaptive_convolution_stride (int): The first dimension of strides in + the adaptive convolution of ``Temporal Adaptive Aggregation``. + Defaults to 1. + adaptive_convolution_padding (int): The first dimension of paddings in + the adaptive convolution of ``Temporal Adaptive Aggregation``. + Defaults to 1. + init_std (float): Std value for initiation of `nn.Linear`. Defaults to + 0.001. + """ + + def __init__(self, + in_channels: int, + num_segments: int, + alpha: int = 2, + adaptive_kernel_size: int = 3, + beta: int = 4, + conv1d_kernel_size: int = 3, + adaptive_convolution_stride: int = 1, + adaptive_convolution_padding: int = 1, + init_std: float = 0.001) -> None: + super().__init__() + + assert beta > 0 and alpha > 0 + self.in_channels = in_channels + self.num_segments = num_segments + self.alpha = alpha + self.adaptive_kernel_size = adaptive_kernel_size + self.beta = beta + self.conv1d_kernel_size = conv1d_kernel_size + self.adaptive_convolution_stride = adaptive_convolution_stride + self.adaptive_convolution_padding = adaptive_convolution_padding + self.init_std = init_std + + self.G = nn.Sequential( + nn.Linear(num_segments, num_segments * alpha, bias=False), + nn.BatchNorm1d(num_segments * alpha), nn.ReLU(inplace=True), + nn.Linear(num_segments * alpha, adaptive_kernel_size, bias=False), + nn.Softmax(-1)) + + self.L = nn.Sequential( + nn.Conv1d( + in_channels, + in_channels // beta, + conv1d_kernel_size, + stride=1, + padding=conv1d_kernel_size // 2, + bias=False), nn.BatchNorm1d(in_channels // beta), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels // beta, in_channels, 1, bias=False), + nn.Sigmoid()) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The output of the module. 
+ """ + # [n, c, h, w] + n, c, h, w = x.size() + num_segments = self.num_segments + num_batches = n // num_segments + assert c == self.in_channels + + # [num_batches, c, num_segments, h, w] + x = x.view(num_batches, num_segments, c, h, w) + x = x.permute(0, 2, 1, 3, 4).contiguous() + + # [num_batches * c, num_segments, 1, 1] + theta_out = F.adaptive_avg_pool2d( + x.view(-1, num_segments, h, w), (1, 1)) + + # [num_batches * c, 1, adaptive_kernel_size, 1] + conv_kernel = self.G(theta_out.view(-1, num_segments)).view( + num_batches * c, 1, -1, 1) + + # [num_batches, c, num_segments, 1, 1] + local_activation = self.L(theta_out.view(-1, c, num_segments)).view( + num_batches, c, num_segments, 1, 1) + + # [num_batches, c, num_segments, h, w] + new_x = x * local_activation + + # [1, num_batches * c, num_segments, h * w] + y = F.conv2d( + new_x.view(1, num_batches * c, num_segments, h * w), + conv_kernel, + bias=None, + stride=(self.adaptive_convolution_stride, 1), + padding=(self.adaptive_convolution_padding, 0), + groups=num_batches * c) + + # [n, c, h, w] + y = y.view(num_batches, c, num_segments, h, w) + y = y.permute(0, 2, 1, 3, 4).contiguous().view(n, c, h, w) + + return y diff --git a/mmaction/models/common/transformer.py b/mmaction/models/common/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5af874f05f5b33ebe8bc57a345153fa2ee6df825 --- /dev/null +++ b/mmaction/models/common/transformer.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from einops import rearrange +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init +from mmengine.utils import digit_version + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DividedTemporalAttentionWithNorm(BaseModule): + """Temporal Attention in Divided Space Time Attention. 
+ + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads in + TransformerCoder. + num_frames (int): Number of frames in the video. + attn_drop (float): A Dropout layer on attn_output_weights. Defaults to + 0.. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Defaults to 0.. + dropout_layer (dict): The dropout_layer used when adding the shortcut. + Defaults to `dict(type='DropPath', drop_prob=0.1)`. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. + init_cfg (dict | None): The Config for initialization. Defaults to + None. + """ + + def __init__(self, + embed_dims, + num_heads, + num_frames, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropPath', drop_prob=0.1), + norm_cfg=dict(type='LN'), + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.num_frames = num_frames + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + + if digit_version(torch.__version__) < digit_version('1.9.0'): + kwargs.pop('batch_first', None) + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + self.temporal_fc = nn.Linear(self.embed_dims, self.embed_dims) + + self.init_weights() + + def init_weights(self): + """Initialize weights.""" + constant_init(self.temporal_fc, val=0, bias=0) + + def forward(self, query, key=None, value=None, residual=None, **kwargs): + """Defines the computation performed at every call.""" + assert residual is None, ( + 'Always adding the shortcut in the forward function') + + init_cls_token = query[:, 0, :].unsqueeze(1) + identity = query_t = query[:, 1:, :] + + # query_t [batch_size, num_patches * num_frames, embed_dims] + b, pt, m = query_t.size() + p, t = pt // self.num_frames, self.num_frames + + # 
res_temporal [batch_size * num_patches, num_frames, embed_dims] + query_t = self.norm(query_t.reshape(b * p, t, m)).permute(1, 0, 2) + res_temporal = self.attn(query_t, query_t, query_t)[0].permute(1, 0, 2) + res_temporal = self.dropout_layer( + self.proj_drop(res_temporal.contiguous())) + res_temporal = self.temporal_fc(res_temporal) + + # res_temporal [batch_size, num_patches * num_frames, embed_dims] + res_temporal = res_temporal.reshape(b, p * t, m) + + # ret_value [batch_size, num_patches * num_frames + 1, embed_dims] + new_query_t = identity + res_temporal + new_query = torch.cat((init_cls_token, new_query_t), 1) + return new_query + + +@MODELS.register_module() +class DividedSpatialAttentionWithNorm(BaseModule): + """Spatial Attention in Divided Space Time Attention. + + Args: + embed_dims (int): Dimensions of embedding. + num_heads (int): Number of parallel attention heads in + TransformerCoder. + num_frames (int): Number of frames in the video. + attn_drop (float): A Dropout layer on attn_output_weights. Defaults to + 0.. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Defaults to 0.. + dropout_layer (dict): The dropout_layer used when adding the shortcut. + Defaults to `dict(type='DropPath', drop_prob=0.1)`. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. + init_cfg (dict | None): The Config for initialization. Defaults to + None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + num_frames, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropPath', drop_prob=0.1), + norm_cfg=dict(type='LN'), + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + self.num_frames = num_frames + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + if digit_version(torch.__version__) < digit_version('1.9.0'): + kwargs.pop('batch_first', None) + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + self.init_weights() + + def init_weights(self): + """init DividedSpatialAttentionWithNorm by default.""" + pass + + def forward(self, query, key=None, value=None, residual=None, **kwargs): + """Defines the computation performed at every call.""" + assert residual is None, ( + 'Always adding the shortcut in the forward function') + + identity = query + init_cls_token = query[:, 0, :].unsqueeze(1) + query_s = query[:, 1:, :] + + # query_s [batch_size, num_patches * num_frames, embed_dims] + b, pt, m = query_s.size() + p, t = pt // self.num_frames, self.num_frames + + # cls_token [batch_size * num_frames, 1, embed_dims] + cls_token = init_cls_token.repeat(1, t, 1).reshape(b * t, + m).unsqueeze(1) + + # query_s [batch_size * num_frames, num_patches + 1, embed_dims] + query_s = rearrange(query_s, 'b (p t) m -> (b t) p m', p=p, t=t) + query_s = torch.cat((cls_token, query_s), 1) + + # res_spatial [batch_size * num_frames, num_patches + 1, embed_dims] + query_s = self.norm(query_s).permute(1, 0, 2) + res_spatial = self.attn(query_s, query_s, query_s)[0].permute(1, 0, 2) + res_spatial = self.dropout_layer( + self.proj_drop(res_spatial.contiguous())) + + # cls_token [batch_size, 1, embed_dims] + cls_token = res_spatial[:, 0, :].reshape(b, t, m) + cls_token = 
torch.mean(cls_token, 1, True) + + # res_spatial [batch_size * num_frames, num_patches + 1, embed_dims] + res_spatial = rearrange( + res_spatial[:, 1:, :], '(b t) p m -> b (p t) m', p=p, t=t) + res_spatial = torch.cat((cls_token, res_spatial), 1) + + new_query = identity + res_spatial + return new_query + + +@MODELS.register_module() +class FFNWithNorm(FFN): + """FFN with pre normalization layer. + + FFNWithNorm is implemented to be compatible with `BaseTransformerLayer` + when using `DividedTemporalAttentionWithNorm` and + `DividedSpatialAttentionWithNorm`. + + FFNWithNorm has one main difference with FFN: + + - It apply one normalization layer before forwarding the input data to + feed-forward networks. + + Args: + embed_dims (int): Dimensions of embedding. Defaults to 256. + feedforward_channels (int): Hidden dimension of FFNs. Defaults to 1024. + num_fcs (int, optional): Number of fully-connected layers in FFNs. + Defaults to 2. + act_cfg (dict): Config for activate layers. + Defaults to `dict(type='ReLU')` + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Defaults to 0.. + add_residual (bool, optional): Whether to add the + residual connection. Defaults to `True`. + dropout_layer (dict | None): The dropout_layer used when adding the + shortcut. Defaults to None. + init_cfg (dict): The Config for initialization. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. Defaults to + `dict(type='LN')`. 
+ """ + + def __init__(self, *args, norm_cfg=dict(type='LN'), **kwargs): + super().__init__(*args, **kwargs) + self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1] + + def forward(self, x, residual=None): + """Defines the computation performed at every call.""" + assert residual is None, ('Cannot apply pre-norm with FFNWithNorm') + return super().forward(self.norm(x), x) diff --git a/mmaction/models/data_preprocessors/__init__.py b/mmaction/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d8ab6cd02f64b4f4fce318e384d708482425f2 --- /dev/null +++ b/mmaction/models/data_preprocessors/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import ActionDataPreprocessor +from .multimodal_data_preprocessor import MultiModalDataPreprocessor + +__all__ = ['ActionDataPreprocessor', 'MultiModalDataPreprocessor'] diff --git a/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-312.pyc b/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c52b2a0ec98f626c5901443ac31b0b55067e929 Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-312.pyc b/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f94aa48e35e7f1af605c11827229bed50158dbe Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/data_preprocessor.cpython-312.pyc differ diff --git a/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-312.pyc b/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..76eb5b781e1243341b0e04475a23b6fda00c2cfe Binary files /dev/null and b/mmaction/models/data_preprocessors/__pycache__/multimodal_data_preprocessor.cpython-312.pyc differ diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..499cdd730fc53d66dcb5a657f5476f6f25fc8536 --- /dev/null +++ b/mmaction/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch + +from mmaction.registry import MODELS +from mmaction.utils import SampleList + + +@MODELS.register_module() +class ActionDataPreprocessor(BaseDataPreprocessor): + """Data pre-processor for action recognition tasks. + + Args: + mean (Sequence[float or int], optional): The pixel mean of channels + of images or stacked optical flow. Defaults to None. + std (Sequence[float or int], optional): The pixel standard deviation + of channels of images or stacked optical flow. Defaults to None. + to_rgb (bool): Whether to convert image from BGR to RGB. + Defaults to False. + to_float32 (bool): Whether to convert data to float32. + Defaults to True. + blending (dict, optional): Config for batch blending. + Defaults to None. + format_shape (str): Format shape of input data. + Defaults to ``'NCHW'``. 
+ """ + + def __init__(self, + mean: Optional[Sequence[Union[float, int]]] = None, + std: Optional[Sequence[Union[float, int]]] = None, + to_rgb: bool = False, + to_float32: bool = True, + blending: Optional[dict] = None, + format_shape: str = 'NCHW') -> None: + super().__init__() + self.to_rgb = to_rgb + self.to_float32 = to_float32 + self.format_shape = format_shape + + if mean is not None: + assert std is not None, 'To enable the normalization in ' \ + 'preprocessing, please specify both ' \ + '`mean` and `std`.' + # Enable the normalization in preprocessing. + self._enable_normalize = True + if self.format_shape == 'NCHW': + normalizer_shape = (-1, 1, 1) + elif self.format_shape in ['NCTHW', 'MIX2d3d']: + normalizer_shape = (-1, 1, 1, 1) + else: + raise ValueError(f'Invalid format shape: {format_shape}') + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(normalizer_shape), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(normalizer_shape), + False) + else: + self._enable_normalize = False + + if blending is not None: + self.blending = MODELS.build(blending) + else: + self.blending = None + + def forward(self, + data: Union[dict, Tuple[dict]], + training: bool = False) -> Union[dict, Tuple[dict]]: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation based on ``BaseDataPreprocessor``. + + Args: + data (dict or Tuple[dict]): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict or Tuple[dict]: Data in the same format as the model input. 
+ """ + data = self.cast_data(data) + if isinstance(data, dict): + return self.forward_onesample(data, training=training) + elif isinstance(data, (tuple, list)): + outputs = [] + for data_sample in data: + output = self.forward_onesample(data_sample, training=training) + outputs.append(output) + return tuple(outputs) + else: + raise TypeError(f'Unsupported data type: {type(data)}!') + + def forward_onesample(self, data, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation on one data sample. + + Args: + data (dict): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + inputs, data_samples = data['inputs'], data['data_samples'] + inputs, data_samples = self.preprocess(inputs, data_samples, training) + data['inputs'] = inputs + data['data_samples'] = data_samples + return data + + def preprocess(self, + inputs: List[torch.Tensor], + data_samples: SampleList, + training: bool = False) -> Tuple: + # --- Pad and stack -- + batch_inputs = stack_batch(inputs) + + if self.format_shape == 'MIX2d3d': + if batch_inputs.ndim == 4: + format_shape, view_shape = 'NCHW', (-1, 1, 1) + else: + format_shape, view_shape = 'NCTHW', None + else: + format_shape, view_shape = self.format_shape, None + + # ------ To RGB ------ + if self.to_rgb: + if format_shape == 'NCHW': + batch_inputs = batch_inputs[..., [2, 1, 0], :, :] + elif format_shape == 'NCTHW': + batch_inputs = batch_inputs[..., [2, 1, 0], :, :, :] + else: + raise ValueError(f'Invalid format shape: {format_shape}') + + # -- Normalization --- + if self._enable_normalize: + if view_shape is None: + batch_inputs = (batch_inputs - self.mean) / self.std + else: + mean = self.mean.view(view_shape) + std = self.std.view(view_shape) + batch_inputs = (batch_inputs - mean) / std + elif self.to_float32: + batch_inputs = batch_inputs.to(torch.float32) + + # ----- 
Blending ----- + if training and self.blending is not None: + batch_inputs, data_samples = self.blending(batch_inputs, + data_samples) + + return batch_inputs, data_samples diff --git a/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..c4353c447244251f2000cdf6a1efc8df3135a349 --- /dev/null +++ b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +from mmengine.model import BaseDataPreprocessor, ModuleDict + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class MultiModalDataPreprocessor(BaseDataPreprocessor): + """Multi-Modal data pre-processor for action recognition tasks.""" + + def __init__(self, preprocessors: Dict) -> None: + super().__init__() + self.preprocessors = ModuleDict() + for name, pre_cfg in preprocessors.items(): + assert 'type' in pre_cfg, ( + 'Each data preprocessor should contain the key type, ' + f'but got {pre_cfg}') + self.preprocessors[name] = MODELS.build(pre_cfg) + + def forward(self, data: Dict, training: bool = False) -> Dict: + """Preprocesses the data into the model input format. + + Args: + data (dict): Data returned by dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. 
+ """ + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + for modality, modality_data in inputs.items(): + preprocessor = self.preprocessors[modality] + modality_data, data_samples = preprocessor.preprocess( + modality_data, data_samples, training) + inputs[modality] = modality_data + + data['inputs'] = inputs + data['data_samples'] = data_samples + return data diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b03f72f37fe383eb691f00625bdd6f904a9504cf --- /dev/null +++ b/mmaction/models/heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseHead +from .feature_head import FeatureHead +from .gcn_head import GCNHead +from .i3d_head import I3DHead +from .mvit_head import MViTHead +from .omni_head import OmniHead +from .rgbpose_head import RGBPoseHead +from .slowfast_head import SlowFastHead +from .timesformer_head import TimeSformerHead +from .tpn_head import TPNHead +from .trn_head import TRNHead +from .tsm_head import TSMHead +from .tsn_audio_head import TSNAudioHead +from .tsn_head import TSNHead +from .uniformer_head import UniFormerHead +from .x3d_head import X3DHead + +__all__ = [ + 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', + 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', + 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead', 'FeatureHead' +] diff --git a/mmaction/models/heads/__pycache__/__init__.cpython-312.pyc b/mmaction/models/heads/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04054e9548fb2ececbd703f9032438f20adcc31c Binary files /dev/null and b/mmaction/models/heads/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/base.cpython-312.pyc b/mmaction/models/heads/__pycache__/base.cpython-312.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..cd5dec7cacdf0e44fac038d3bdc89e026614bb16 Binary files /dev/null and b/mmaction/models/heads/__pycache__/base.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/feature_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/feature_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd63946b954cbe291129f34e2b0c99e3395ec5e8 Binary files /dev/null and b/mmaction/models/heads/__pycache__/feature_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/gcn_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/gcn_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50a0ce2ed05b7f965d967fabec91c86f72b60e3d Binary files /dev/null and b/mmaction/models/heads/__pycache__/gcn_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/i3d_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/i3d_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..326c48394919286ded460b2d86648cf4b375060f Binary files /dev/null and b/mmaction/models/heads/__pycache__/i3d_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/mvit_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/mvit_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b39dd8fbd0031a36964718c42ebadad9a9c565 Binary files /dev/null and b/mmaction/models/heads/__pycache__/mvit_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/omni_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/omni_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99604b5967a14fc9a382fed933273b0ca1626696 Binary files /dev/null and b/mmaction/models/heads/__pycache__/omni_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/rgbpose_head.cpython-312.pyc 
b/mmaction/models/heads/__pycache__/rgbpose_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e146d13d4b9a6c1282302d06828363cbc830e3e5 Binary files /dev/null and b/mmaction/models/heads/__pycache__/rgbpose_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/slowfast_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/slowfast_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2330424fcdf2ce88f365b487f9e8a45f07cff30e Binary files /dev/null and b/mmaction/models/heads/__pycache__/slowfast_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/timesformer_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/timesformer_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a137e24888d7c343dd848bd999e5a3b121f87903 Binary files /dev/null and b/mmaction/models/heads/__pycache__/timesformer_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/tpn_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/tpn_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba99c3bdf5cdf72f9faccde470f6b40ee728555e Binary files /dev/null and b/mmaction/models/heads/__pycache__/tpn_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/trn_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/trn_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d1a73cc4d8a11d122782bb811e412831b310daf Binary files /dev/null and b/mmaction/models/heads/__pycache__/trn_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/tsm_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/tsm_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6106f81074ee34b190922b11ce2af9535272d9d7 Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsm_head.cpython-312.pyc 
differ diff --git a/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..629956aa6576fcbe8f864c2b143c65c931e9450a Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsn_audio_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/tsn_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/tsn_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72f52835cce4c749dcb69bf032887ec4e1a4f8ca Binary files /dev/null and b/mmaction/models/heads/__pycache__/tsn_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/uniformer_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/uniformer_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfd308844d020ecfd07747cbf26c9f4ddc7f7e2f Binary files /dev/null and b/mmaction/models/heads/__pycache__/uniformer_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/__pycache__/x3d_head.cpython-312.pyc b/mmaction/models/heads/__pycache__/x3d_head.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96900ab8997962e8a4316d5bec65e954ec7cb28f Binary files /dev/null and b/mmaction/models/heads/__pycache__/x3d_head.cpython-312.pyc differ diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py new file mode 100644 index 0000000000000000000000000000000000000000..98ee11ee745fc965fc10d99c46989fad35e7261d --- /dev/null +++ b/mmaction/models/heads/base.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import ForwardResults, SampleList + + +class AvgConsensus(nn.Module): + """Average consensus module. + + Args: + dim (int): Decide which dim consensus function to apply. + Defaults to 1. + """ + + def __init__(self, dim: int = 1) -> None: + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + return x.mean(dim=self.dim, keepdim=True) + + +class BaseHead(BaseModule, metaclass=ABCMeta): + """Base class for head. + + All Head should subclass it. + All subclass should overwrite: + - :meth:`forward`, supporting to forward both for training and testing. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss', loss_weight=1.0)``. + multi_class (bool): Determines whether it is a multi-class + recognition task. Defaults to False. + label_smooth_eps (float): Epsilon used in label smooth. + Reference: arxiv.org/abs/1906.02629. Defaults to 0. + topk (int or tuple): Top-k accuracy. Defaults to ``(1, 5)``. + average_clips (dict, optional): Config for averaging class + scores over multiple clips. Defaults to None. + init_cfg (dict, optional): Config to control the initialization. + Defaults to None. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: Dict = dict( + type='CrossEntropyLoss', loss_weight=1.0), + multi_class: bool = False, + label_smooth_eps: float = 0.0, + topk: Union[int, Tuple[int]] = (1, 5), + average_clips: Optional[Dict] = None, + init_cfg: Optional[Dict] = None) -> None: + super(BaseHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_cls = MODELS.build(loss_cls) + self.multi_class = multi_class + self.label_smooth_eps = label_smooth_eps + self.average_clips = average_clips + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + for _topk in topk: + assert _topk > 0, 'Top-k should be larger than 0' + self.topk = topk + + @abstractmethod + def forward(self, x, **kwargs) -> ForwardResults: + """Defines the computation performed at every call.""" + raise NotImplementedError + + def loss(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (torch.Tensor): Classification prediction results of + all class, has shape (batch_size, num_classes). + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. 
+ """ + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(cls_scores.device) + labels = labels.squeeze() + + losses = dict() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. 
+ """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_label`. + + Returns: + List[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() + + for data_sample, score, pred_label in zip(data_samples, cls_scores, + pred_labels): + data_sample.set_pred_score(score) + data_sample.set_pred_label(pred_label) + return data_samples + + def average_clip(self, + cls_scores: torch.Tensor, + num_segs: int = 1) -> torch.Tensor: + """Averaging class scores over multiple clips. + + Using different averaging types ('score' or 'prob' or None, + which defined in test_cfg) to computed the final averaged + class score. Only called in test mode. + + Args: + cls_scores (torch.Tensor): Class scores to be averaged. + num_segs (int): Number of clips for each input sample. + + Returns: + torch.Tensor: Averaged class scores. + """ + + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. 
' + f'Currently supported ones are ' + f'["score", "prob", None]') + + batch_size = cls_scores.shape[0] + cls_scores = cls_scores.view((batch_size // num_segs, num_segs) + + cls_scores.shape[1:]) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores diff --git a/mmaction/models/heads/feature_head.py b/mmaction/models/heads/feature_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b010daa65caa2da9588ea0644444fc7fc1dfc97c --- /dev/null +++ b/mmaction/models/heads/feature_head.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from torch import Tensor + +from mmaction.registry import MODELS +from .base import BaseHead + + +@MODELS.register_module() +class FeatureHead(BaseHead): + """General head for feature extraction. + + Args: + spatial_type (str, optional): Pooling type in spatial dimension. + Default: 'avg'. If set to None, means keeping spatial dimension, + and for GCN backbone, keeping last two dimension(T, V). + temporal_type (str, optional): Pooling type in temporal dimension. + Default: 'avg'. If set to None, meanse keeping temporal dimnsion, + and for GCN backbone, keeping dimesion M. Please note that the + channel order would keep same with the output of backbone, + [N, T, C, H, W] for 2D recognizer, and [N, M, C, T, V] for GCN + recognizer. + backbone_name (str, optional): Backbone name to specifying special + operations.Currently supports: `'tsm'`, `'slowfast'`, and `'gcn'`. + Defaults to None, means take the input as normal feature. + num_segments (int, optional): Number of frame segments for TSM + backbone. Defaults to None. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + spatial_type: str = 'avg', + temporal_type: str = 'avg', + backbone_name: Optional[str] = None, + num_segments: Optional[str] = None, + **kwargs) -> None: + super().__init__(None, None, **kwargs) + + self.temporal_type = temporal_type + self.backbone_name = backbone_name + self.num_segments = num_segments + if spatial_type == 'avg': + self.pool2d = torch.mean + elif spatial_type == 'max': + self.pool2d = torch.max + elif spatial_type is None: + self.pool2d = lambda x, dim: x + else: + raise NotImplementedError( + f'Unsupported spatial_type {spatial_type}') + + if temporal_type == 'avg': + self.pool1d = torch.mean + elif temporal_type == 'max': + self.pool1d = torch.max + elif temporal_type is None: + self.pool1d = lambda x, dim: x + else: + raise NotImplementedError( + f'Unsupported temporal_type {temporal_type}') + + def forward(self, + x: Tensor, + num_segs: Optional[int] = None, + **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + num_segs (int): For 2D backbone. Number of segments into which + a video is divided. Defaults to None. + Returns: + Tensor: The output features after pooling. 
+ """ + if isinstance(x, Tensor): + n_dims = x.ndim + elif isinstance(x, tuple): + n_dims = x[0].ndim + assert self.backbone_name == 'slowfast', \ + 'Only support SlowFast backbone to input tuple' + else: + raise NotImplementedError(f'Unsupported feature type: {type(x)}') + # For 2D backbone with spatial dimension + if n_dims == 4: + assert num_segs is not None + if self.backbone_name == 'tsm': + assert self.num_segments is not None, \ + 'Please Specify num_segments for TSM' + num_segs = self.num_segments + # [N, T, channels, H, W] + x = x.view((-1, num_segs) + x.shape[1:]) + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1) + + elif n_dims == 5: + if self.backbone_name == 'slowfast': + x_slow, x_fast = x + assert self.temporal_type is not None, \ + 'slowfast backbone has to pool temporal dimension' + x_fast = self.pool1d(self.pool2d(x_fast, dim=[-2, -1]), dim=2) + x_slow = self.pool1d(self.pool2d(x_slow, dim=[-2, -1]), dim=2) + feat = torch.cat((x_slow, x_fast), dim=1) + + # For GCN-based backbone + elif self.backbone_name == 'gcn': + # N, M, C, T, V + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1) + # For 3D backbone with spatial dimension + else: + # [N, channels, T, H, W] + feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=2) + # For backbone output feature without spatial and temporal dimension + elif n_dims == 2: + # [N, channels] + feat = x + + return feat + + def predict_by_feat(self, feats: Union[Tensor, Tuple[Tensor]], + data_samples) -> Tensor: + """Integrate multi-view features into one tensor. + + Args: + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + Tensor: The integrated multi-view features. 
+ """ + num_segs = feats.shape[0] // len(data_samples) + feats = self.average_clip(feats, num_segs=num_segs) + + return feats diff --git a/mmaction/models/heads/gcn_head.py b/mmaction/models/heads/gcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1d21504640c64fad13ff714cb0d12fe9bdfb2338 --- /dev/null +++ b/mmaction/models/heads/gcn_head.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Union + +import torch +import torch.nn as nn + +from mmaction.registry import MODELS +from .base import BaseHead + + +@MODELS.register_module() +class GCNHead(BaseHead): + """The classification head for GCN. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + dropout (float): Probability of dropout layer. Defaults to 0. + init_cfg (dict or list[dict]): Config to control the initialization. + Defaults to ``dict(type='Normal', layer='Linear', std=0.01)``. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: Dict = dict(type='CrossEntropyLoss'), + dropout: float = 0., + average_clips: str = 'prob', + init_cfg: Union[Dict, List[Dict]] = dict( + type='Normal', layer='Linear', std=0.01), + **kwargs) -> None: + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + average_clips=average_clips, + init_cfg=init_cfg, + **kwargs) + self.dropout_ratio = dropout + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(self.in_channels, self.num_classes) + + def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: + """Forward features from the upstream network. + + Args: + x (torch.Tensor): Features from the upstream network. 
+ + Returns: + torch.Tensor: Classification scores with shape (B, num_classes). + """ + + N, M, C, T, V = x.shape + x = x.view(N * M, C, T, V) + x = self.pool(x) + x = x.view(N, M, C) + x = x.mean(dim=1) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores diff --git a/mmaction/models/heads/i3d_head.py b/mmaction/models/heads/i3d_head.py new file mode 100644 index 0000000000000000000000000000000000000000..53ad1b4243fbc46624cbfcb149e35cd0930f1190 --- /dev/null +++ b/mmaction/models/heads/i3d_head.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class I3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # [N, in_channels, 4, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N, in_channels, 1, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1, 1] + x = x.view(x.shape[0], -1) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d832f4a09c230ab6901f66e4a275c0e20641601a --- /dev/null +++ b/mmaction/models/heads/mvit_head.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Tuple + +from mmengine.model.weight_init import constant_init, trunc_normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class MViTHead(BaseHead): + """Classification head for Multi-scale ViT. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. Defaults to 0.5. + init_std (float): Std value for Initiation. Defaults to 0.02. + init_scale (float): Scale factor for Initiation parameters. + Defaults to 1. + with_cls_token (bool): Whether the backbone output feature with + cls_token. Defaults to True. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.5, + init_std: float = 0.02, + init_scale: float = 1.0, + with_cls_token: bool = True, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.init_scale = init_scale + self.dropout_ratio = dropout_ratio + self.with_cls_token = with_cls_token + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls.weight, std=self.init_std) + constant_init(self.fc_cls.bias, 0.02) + self.fc_cls.weight.data.mul_(self.init_scale) + self.fc_cls.bias.data.mul_(self.init_scale) + + def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: + """The process before the final classification head. + + The input ``feats`` is a tuple of list of tensor, and each tensor is + the feature of a backbone stage. + """ + if self.with_cls_token: + _, cls_token = feats[-1] + return cls_token + else: + patch_token = feats[-1] + return patch_token.mean(dim=(2, 3, 4)) + + def forward(self, x: Tuple[List[Tensor]], **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tuple[List[Tensor]]): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + x = self.pre_logits(x) + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/omni_head.py b/mmaction/models/heads/omni_head.py new file mode 100644 index 0000000000000000000000000000000000000000..04c42e603dfc88b9c2e781a2d3ea76317df5434d --- /dev/null +++ b/mmaction/models/heads/omni_head.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, SampleList +from .base import BaseHead + + +@MODELS.register_module() +class OmniHead(BaseHead): + """Classification head for OmniResNet that accepts both image and video + inputs. + + Args: + image_classes (int): Number of image classes to be classified. + video_classes (int): Number of video classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss') + image_dropout_ratio (float): Probability of dropout layer for the image + head. Defaults to 0.2. + video_dropout_ratio (float): Probability of dropout layer for the video + head. Defaults to 0.5. + video_nl_head (bool): if true, use a non-linear head for the video + head. Defaults to True. 
+ """ + + def __init__(self, + image_classes: int, + video_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + image_dropout_ratio: float = 0.2, + video_dropout_ratio: float = 0.5, + video_nl_head: bool = True, + **kwargs) -> None: + super().__init__(image_classes, in_channels, loss_cls, **kwargs) + + self.fc2d = nn.Sequential( + nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.BatchNorm1d(in_channels), + nn.Dropout(image_dropout_ratio), + nn.Linear(in_channels, image_classes)) + + if video_nl_head: + self.fc3d = nn.Sequential( + nn.AdaptiveAvgPool3d(1), nn.Flatten(), + nn.Linear(in_channels, video_classes * 2), + nn.BatchNorm1d(video_classes * 2), nn.ReLU(inplace=True), + nn.Dropout(video_dropout_ratio), + nn.Linear(video_classes * 2, video_classes)) + else: + self.fc3d = nn.Sequential( + nn.AdaptiveAvgPool3d(1), nn.Flatten(), + nn.BatchNorm1d(in_channels), nn.Dropout(video_dropout_ratio), + nn.Linear(in_channels, video_classes)) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + if len(x.shape) == 4: + cls_score = self.fc2d(x) + else: + cls_score = self.fc3d(x) + return cls_score + + def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], + data_samples: SampleList) -> dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (Tensor): Classification prediction results of + all class, has shape (batch_size, num_classes). + data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. 
+ """ + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(cls_scores.device) + labels = labels.squeeze() + + losses = dict() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_socre` share the same + # shape. + labels = labels.unsqueeze(0) + + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses diff --git a/mmaction/models/heads/rgbpose_head.py b/mmaction/models/heads/rgbpose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ff68e14842a57da5fc479f4f85149c1c8d2188e3 --- /dev/null +++ b/mmaction/models/heads/rgbpose_head.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model.weight_init import normal_init + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseHead + + +@MODELS.register_module() +class RGBPoseHead(BaseHead): + """The classification head for RGBPoseConv3D. + + Args: + num_classes (int): Number of classes to be classified. 
+ in_channels (tuple[int]): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + loss_components (list[str]): The components of the loss. + Defaults to ``['rgb', 'pose']``. + loss_weights (float or tuple[float]): The weights of the losses. + Defaults to 1. + dropout (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + """ + + def __init__(self, + num_classes: int, + in_channels: Tuple[int], + loss_cls: Dict = dict(type='CrossEntropyLoss'), + loss_components: List[str] = ['rgb', 'pose'], + loss_weights: Union[float, Tuple[float]] = 1., + dropout: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + if isinstance(dropout, float): + dropout = {'rgb': dropout, 'pose': dropout} + assert isinstance(dropout, dict) + + if loss_components is not None: + self.loss_components = loss_components + if isinstance(loss_weights, float): + loss_weights = [loss_weights] * len(loss_components) + assert len(loss_weights) == len(loss_components) + self.loss_weights = loss_weights + + self.dropout = dropout + self.init_std = init_std + + self.dropout_rgb = nn.Dropout(p=self.dropout['rgb']) + self.dropout_pose = nn.Dropout(p=self.dropout['pose']) + + self.fc_rgb = nn.Linear(self.in_channels[0], num_classes) + self.fc_pose = nn.Linear(self.in_channels[1], num_classes) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_rgb, std=self.init_std) + normal_init(self.fc_pose, std=self.init_std) + + def forward(self, x: Tuple[torch.Tensor]) -> Dict: + """Defines the computation performed at every call.""" + x_rgb, x_pose = self.avg_pool(x[0]), self.avg_pool(x[1]) + x_rgb = x_rgb.view(x_rgb.size(0), -1) + x_pose = x_pose.view(x_pose.size(0), -1) + + x_rgb = self.dropout_rgb(x_rgb) + 
x_pose = self.dropout_pose(x_pose) + + cls_scores = dict() + cls_scores['rgb'] = self.fc_rgb(x_rgb) + cls_scores['pose'] = self.fc_pose(x_pose) + + return cls_scores + + def loss(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + labels = torch.stack([x.gt_label for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + losses = dict() + for loss_name, weight in zip(self.loss_components, self.loss_weights): + cls_score = cls_scores[loss_name] + loss_cls = self.loss_by_scores(cls_score, labels) + loss_cls = {loss_name + '_' + k: v for k, v in loss_cls.items()} + loss_cls[f'{loss_name}_loss_cls'] *= weight + losses.update(loss_cls) + return losses + + def loss_by_scores(self, cls_scores: torch.Tensor, + labels: torch.Tensor) -> Dict: + """Calculate the loss based on the features extracted by the head. 
+ + Args: + cls_scores (torch.Tensor): Classification prediction + results of all class, has shape (batch_size, num_classes). + labels (torch.Tensor): The labels used to calculate the loss. + + Returns: + dict: A dictionary of loss components. + """ + losses = dict() + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_label`. 
+ + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + pred_scores = [dict() for _ in range(len(data_samples))] + + for name in self.loss_components: + cls_score = cls_scores[name] + cls_score = self.predict_by_scores(cls_score, data_samples) + for pred_score, score in zip(pred_scores, cls_score): + pred_score[f'{name}'] = score + + for data_sample, pred_score, in zip(data_samples, pred_scores): + data_sample.set_pred_score(pred_score) + return data_samples + + def predict_by_scores(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> torch.Tensor: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation + data of every samples. + + Returns: + torch.Tensor: The averaged classification scores. + """ + + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + return cls_scores diff --git a/mmaction/models/heads/slowfast_head.py b/mmaction/models/heads/slowfast_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6745cf32ec4f99132d0c7b7c555c2b515f303ffb --- /dev/null +++ b/mmaction/models/heads/slowfast_head.py @@ -0,0 +1,83 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class SlowFastHead(BaseHead): + """The classification head for SlowFast. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. 
@MODELS.register_module()
class SlowFastHead(BaseHead):
    """The classification head for SlowFast.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature (the sum of
            the slow- and fast-pathway channels).
        loss_cls (dict or ConfigDict): Config for building loss.
            Default: dict(type='CrossEntropyLoss').
        spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
        dropout_ratio (float): Probability of dropout layer. Default: 0.8.
        init_std (float): Std value for Initiation. Default: 0.01.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 loss_cls: ConfigType = dict(type='CrossEntropyLoss'),
                 spatial_type: str = 'avg',
                 dropout_ratio: float = 0.8,
                 init_std: float = 0.01,
                 **kwargs) -> None:
        super().__init__(num_classes, in_channels, loss_cls, **kwargs)
        self.spatial_type = spatial_type
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std

        self.dropout = (
            nn.Dropout(p=dropout_ratio) if dropout_ratio != 0 else None)
        self.fc_cls = nn.Linear(in_channels, num_classes)
        # Adaptive pooling collapses (T, H, W) regardless of input size.
        self.avg_pool = (
            nn.AdaptiveAvgPool3d((1, 1, 1)) if spatial_type == 'avg' else None)

    def init_weights(self) -> None:
        """Initiate the parameters from scratch."""
        normal_init(self.fc_cls, std=self.init_std)

    def forward(self, x: Tuple[Tensor], **kwargs) -> Tensor:
        """Defines the computation performed at every call.

        Args:
            x (tuple[torch.Tensor]): Slow- and fast-pathway feature maps,
                ``([N, C_slow, T1, H, W], [N, C_fast, T2, H, W])``.

        Returns:
            Tensor: The classification scores for input samples.
        """
        slow_path, fast_path = x
        # Pool each pathway to [N, C, 1, 1, 1], then fuse fast-first.
        fused = torch.cat(
            (self.avg_pool(fast_path), self.avg_pool(slow_path)), dim=1)

        if self.dropout is not None:
            fused = self.dropout(fused)

        # [N, C_fast + C_slow]
        flat = fused.view(fused.size(0), -1)
        # [N, num_classes]
        return self.fc_cls(flat)
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + init_std: float = 0.02, + dropout_ratio: float = 0.0, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.dropout_ratio = dropout_ratio + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/tpn_head.py b/mmaction/models/heads/tpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b90d979883fa2f904edc9a439be6a9269cbcacbf --- /dev/null +++ b/mmaction/models/heads/tpn_head.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.device import get_device +from torch import Tensor + +from mmaction.registry import MODELS +from .tsn_head import TSNHead + + +@MODELS.register_module() +class TPNHead(TSNHead): + """Class head for TPN.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. 
@MODELS.register_module()
class TPNHead(TSNHead):
    """Class head for TPN.

    Reuses the TSN head but pools 3D feature maps and supports a
    fully-convolutional testing mode, where the fc classifier is converted
    into an equivalent 1x1x1 convolution.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        if self.spatial_type == 'avg':
            # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels.
            self.avg_pool3d = nn.AdaptiveAvgPool3d((1, 1, 1))
        else:
            self.avg_pool3d = None

        # Created lazily in `forward` once the spatial size is known.
        self.avg_pool2d = None
        # Created lazily on the first fcn-test forward pass.
        self.new_cls = None

    def _init_new_cls(self) -> None:
        """Build a 1x1x1 conv equivalent to ``self.fc_cls`` for fcn test."""
        self.new_cls = nn.Conv3d(self.in_channels, self.num_classes, 1, 1, 0)
        self.new_cls = self.new_cls.to(get_device())
        # Copy through `.data`: an in-place `copy_` into leaf parameters
        # that require grad raises a RuntimeError outside a `no_grad`
        # context, which made this method fail when fcn test was invoked
        # with autograd enabled.
        self.new_cls.weight.data.copy_(self.fc_cls.weight[..., None, None,
                                                          None])
        self.new_cls.bias.data.copy_(self.fc_cls.bias)

    def forward(self,
                x,
                num_segs: Optional[int] = None,
                fcn_test: bool = False,
                **kwargs) -> Tensor:
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input data.
            num_segs (int, optional): Number of segments into which a video
                is divided. Defaults to None.
            fcn_test (bool): Whether to apply full convolution (fcn) testing.
                Defaults to False.

        Returns:
            Tensor: The classification scores for input samples.
        """
        if fcn_test:
            if self.avg_pool3d:
                x = self.avg_pool3d(x)
            if self.new_cls is None:
                self._init_new_cls()
            x = self.new_cls(x)
            cls_score_feat_map = x.view(x.size(0), -1)
            return cls_score_feat_map

        if self.avg_pool2d is None:
            # Fix the pooling kernel to the first-seen spatial size.
            kernel_size = (1, x.shape[-2], x.shape[-1])
            self.avg_pool2d = nn.AvgPool3d(kernel_size, stride=1, padding=0)

        if num_segs is None:
            # [N, in_channels, 3, 7, 7]
            # NOTE(review): assumes `spatial_type == 'avg'`; `avg_pool3d`
            # is None otherwise — confirm callers never hit this branch
            # with max pooling.
            x = self.avg_pool3d(x)
        else:
            # [N * num_segs, in_channels, 7, 7]
            x = self.avg_pool2d(x)
            # [N * num_segs, in_channels, 1, 1]
            x = x.reshape((-1, num_segs) + x.shape[1:])
            # [N, num_segs, in_channels, 1, 1]
            x = self.consensus(x)
            # [N, 1, in_channels, 1, 1]
            x = x.squeeze(1)
            # [N, in_channels, 1, 1]
        if self.dropout is not None:
            x = self.dropout(x)
        # [N, in_channels, 1, 1]
        x = x.view(x.size(0), -1)
        # [N, in_channels]
        cls_score = self.fc_cls(x)
        # [N, num_classes]
        return cls_score
class RelationModule(nn.Module):
    """Relation Module of TRN.

    Fuses the concatenated per-segment features of a clip through a
    two-layer MLP to produce class scores.

    Args:
        hidden_dim (int): The dimension of hidden layer of MLP in relation
            module.
        num_segments (int): Number of frame segments.
        num_classes (int): Number of classes to be classified.
    """

    def __init__(self, hidden_dim, num_segments, num_classes):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_segments = num_segments
        self.num_classes = num_classes
        bottleneck_dim = 512
        # ReLU -> Linear -> ReLU -> Linear over all segments at once.
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Linear(num_segments * hidden_dim, bottleneck_dim),
            nn.ReLU(), nn.Linear(bottleneck_dim, num_classes))

    def init_weights(self):
        """Use the default kaiming_uniform for all nn.linear layers."""
        pass

    def forward(self, x):
        """Defines the computation performed at every call.

        Args:
            x (Tensor): Input of shape ``(N, num_segments, hidden_dim)``.

        Returns:
            Tensor: The classification scores for input samples.
        """
        # [N, num_segs * hidden_dim]
        flattened = x.flatten(1)
        return self.classifier(flattened)
+ """ + + def __init__(self, hidden_dim, num_segments, num_classes): + super().__init__() + self.hidden_dim = hidden_dim + self.num_segments = num_segments + self.num_classes = num_classes + + # generate the multiple frame relations + self.scales = range(num_segments, 1, -1) + + self.relations_scales = [] + self.subsample_scales = [] + max_subsample = 3 + for scale in self.scales: + # select the different frame features for different scales + relations_scale = list( + itertools.combinations(range(self.num_segments), scale)) + self.relations_scales.append(relations_scale) + # sample `max_subsample` relation_scale at most + self.subsample_scales.append( + min(max_subsample, len(relations_scale))) + assert len(self.relations_scales[0]) == 1 + + bottleneck_dim = 256 + self.fc_fusion_scales = nn.ModuleList() + for scale in self.scales: + fc_fusion = nn.Sequential( + nn.ReLU(), nn.Linear(scale * self.hidden_dim, bottleneck_dim), + nn.ReLU(), nn.Linear(bottleneck_dim, self.num_classes)) + self.fc_fusion_scales.append(fc_fusion) + + def init_weights(self): + """Use the default kaiming_uniform for all nn.linear layers.""" + pass + + def forward(self, x): + # the first one is the largest scale + act_all = x[:, self.relations_scales[0][0], :] + act_all = act_all.view( + act_all.size(0), self.scales[0] * self.hidden_dim) + act_all = self.fc_fusion_scales[0](act_all) + + for scaleID in range(1, len(self.scales)): + # iterate over the scales + idx_relations_randomsample = np.random.choice( + len(self.relations_scales[scaleID]), + self.subsample_scales[scaleID], + replace=False) + for idx in idx_relations_randomsample: + act_relation = x[:, self.relations_scales[scaleID][idx], :] + act_relation = act_relation.view( + act_relation.size(0), + self.scales[scaleID] * self.hidden_dim) + act_relation = self.fc_fusion_scales[scaleID](act_relation) + act_all += act_relation + return act_all + + +@MODELS.register_module() +class TRNHead(BaseHead): + """Class head for TRN. 
@MODELS.register_module()
class TRNHead(BaseHead):
    """Class head for TRN.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        num_segments (int): Number of frame segments. Default: 8.
        loss_cls (dict): Config for building loss. Default:
            dict(type='CrossEntropyLoss')
        spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
        relation_type (str): The relation module type. Choices are 'TRN' or
            'TRNMultiScale'. Default: 'TRNMultiScale'.
        hidden_dim (int): The dimension of hidden layer of MLP in relation
            module. Default: 256.
        dropout_ratio (float): Probability of dropout layer. Default: 0.8.
        init_std (float): Std value for Initiation. Default: 0.001.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_segments=8,
                 loss_cls=dict(type='CrossEntropyLoss'),
                 spatial_type='avg',
                 relation_type='TRNMultiScale',
                 hidden_dim=256,
                 dropout_ratio=0.8,
                 init_std=0.001,
                 **kwargs):
        super().__init__(num_classes, in_channels, loss_cls, **kwargs)

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.num_segments = num_segments
        self.spatial_type = spatial_type
        self.relation_type = relation_type
        self.hidden_dim = hidden_dim
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std

        # Dispatch table instead of an if/elif chain.
        relation_modules = {
            'TRN': RelationModule,
            'TRNMultiScale': RelationModuleMultiScale,
        }
        if self.relation_type not in relation_modules:
            raise ValueError(f'Unknown Relation Type {self.relation_type}!')
        self.consensus = relation_modules[self.relation_type](
            self.hidden_dim, self.num_segments, self.num_classes)

        self.dropout = (
            nn.Dropout(p=self.dropout_ratio)
            if self.dropout_ratio != 0 else None)
        self.fc_cls = nn.Linear(self.in_channels, self.hidden_dim)

        # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels.
        self.avg_pool = (
            nn.AdaptiveAvgPool2d(1) if self.spatial_type == 'avg' else None)

    def init_weights(self):
        """Initiate the parameters from scratch."""
        normal_init(self.fc_cls, std=self.init_std)
        self.consensus.init_weights()

    def forward(self, x, num_segs, **kwargs):
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.
            num_segs (int): Useless in TRNHead. By default, `num_segs`
                is equal to `clip_len * num_clips * num_crops`, which is
                automatically generated in Recognizer forward phase and
                useless in TRN models. The `self.num_segments` we need is a
                hyper parameter to build TRN models.
        Returns:
            torch.Tensor: The classification scores for input samples.
        """
        # [N * num_segs, in_channels, 7, 7]
        if self.avg_pool is not None:
            x = self.avg_pool(x)
        # [N * num_segs, in_channels, 1, 1] -> [N * num_segs, in_channels]
        feat = torch.flatten(x, 1)
        if self.dropout is not None:
            feat = self.dropout(feat)

        # [N * num_segs, hidden_dim]
        scores = self.fc_cls(feat)
        # [N, num_segs, hidden_dim]
        scores = scores.view((-1, self.num_segments) + scores.size()[1:])
        # [N, num_classes]
        return self.consensus(scores)
@MODELS.register_module()
class TSMHead(BaseHead):
    """Class head for TSM.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        num_segments (int): Number of frame segments. Default: 8.
        loss_cls (dict or ConfigDict): Config for building loss.
            Default: dict(type='CrossEntropyLoss')
        spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
        consensus (dict or ConfigDict): Consensus config dict.
            Default: dict(type='AvgConsensus', dim=1).
        dropout_ratio (float): Probability of dropout layer. Default: 0.8.
        init_std (float): Std value for Initiation. Default: 0.001.
        is_shift (bool): Indicating whether the feature is shifted.
            Default: True.
        temporal_pool (bool): Indicating whether feature is temporal pooled.
            Default: False.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """

    # NOTE: the docstring defaults above were corrected to match the actual
    # signature defaults (dropout_ratio=0.8, init_std=0.001); the previous
    # text claimed 0.4 and 0.01.

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 num_segments: int = 8,
                 loss_cls: ConfigType = dict(type='CrossEntropyLoss'),
                 spatial_type: str = 'avg',
                 consensus: ConfigType = dict(type='AvgConsensus', dim=1),
                 dropout_ratio: float = 0.8,
                 init_std: float = 0.001,
                 is_shift: bool = True,
                 temporal_pool: bool = False,
                 **kwargs) -> None:
        super().__init__(num_classes, in_channels, loss_cls, **kwargs)

        self.spatial_type = spatial_type
        self.dropout_ratio = dropout_ratio
        self.num_segments = num_segments
        self.init_std = init_std
        self.is_shift = is_shift
        self.temporal_pool = temporal_pool

        # Copy so popping 'type' does not mutate the caller's config dict.
        consensus_ = consensus.copy()

        consensus_type = consensus_.pop('type')
        if get_str_type(consensus_type) == 'AvgConsensus':
            self.consensus = AvgConsensus(**consensus_)
        else:
            # NOTE(review): a None consensus makes `forward` crash at
            # `self.consensus(cls_score)` — confirm only 'AvgConsensus'
            # configs are ever passed.
            self.consensus = None

        if self.dropout_ratio != 0:
            self.dropout = nn.Dropout(p=self.dropout_ratio)
        else:
            self.dropout = None
        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

        if self.spatial_type == 'avg':
            # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels.
            self.avg_pool = nn.AdaptiveAvgPool2d(1)
        else:
            self.avg_pool = None

    def init_weights(self) -> None:
        """Initiate the parameters from scratch."""
        normal_init(self.fc_cls, std=self.init_std)

    def forward(self, x: Tensor, num_segs: int, **kwargs) -> Tensor:
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input data.
            num_segs (int): Useless in TSMHead. By default, `num_segs`
                is equal to `clip_len * num_clips * num_crops`, which is
                automatically generated in Recognizer forward phase and
                useless in TSM models. The `self.num_segments` we need is a
                hyper parameter to build TSM models.
        Returns:
            Tensor: The classification scores for input samples.
        """
        # [N * num_segs, in_channels, 7, 7]
        if self.avg_pool is not None:
            x = self.avg_pool(x)
        # [N * num_segs, in_channels, 1, 1]
        x = torch.flatten(x, 1)
        # [N * num_segs, in_channels]
        if self.dropout is not None:
            x = self.dropout(x)
        # [N * num_segs, num_classes]
        cls_score = self.fc_cls(x)

        if self.is_shift and self.temporal_pool:
            # [2 * N, num_segs // 2, num_classes]
            cls_score = cls_score.view((-1, self.num_segments // 2) +
                                       cls_score.size()[1:])
        else:
            # [N, num_segs, num_classes]
            cls_score = cls_score.view((-1, self.num_segments) +
                                       cls_score.size()[1:])
        # [N, 1, num_classes]
        cls_score = self.consensus(cls_score)
        # [N, num_classes]
        return cls_score.squeeze(1)
@MODELS.register_module()
class TSNAudioHead(BaseHead):
    """Classification head for TSN on audio.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        loss_cls (Union[dict, ConfigDict]): Config for building loss.
            Defaults to ``dict(type='CrossEntropyLoss')``.
        spatial_type (str): Pooling type in spatial dimension.
            Defaults to ``avg``.
        dropout_ratio (float): Probability of dropout layer. Defaults to 0.4.
        init_std (float): Std value for Initiation. Defaults to 0.01.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 loss_cls: ConfigType = dict(type='CrossEntropyLoss'),
                 spatial_type: str = 'avg',
                 dropout_ratio: float = 0.4,
                 init_std: float = 0.01,
                 **kwargs) -> None:
        super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs)

        self.spatial_type = spatial_type
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std

        # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels.
        self.avg_pool = (
            nn.AdaptiveAvgPool2d((1, 1)) if spatial_type == 'avg' else None)
        self.dropout = (
            nn.Dropout(p=dropout_ratio) if dropout_ratio != 0 else None)
        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

    def init_weights(self) -> None:
        """Initiate the parameters from scratch."""
        normal_init(self.fc_cls, std=self.init_std)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): Input features of shape
                ``(N, in_channels, h, w)``.

        Returns:
            torch.Tensor: The classification scores for input samples,
            shape ``(N, num_classes)``.
        """
        pooled = self.avg_pool(x)                # (N, in_channels, 1, 1)
        flat = pooled.view(pooled.size(0), -1)   # (N, in_channels)
        if self.dropout is not None:
            flat = self.dropout(flat)
        return self.fc_cls(flat)                 # (N, num_classes)
+ """ + # [N * num_segs, in_channels, h, w] + x = self.avg_pool(x) + # [N, in_channels, 1, 1] + x = x.view(x.size(0), -1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/tsn_head.py b/mmaction/models/heads/tsn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e11ce7d15bea53d453eb03c79a3a9dfff7d9a925 --- /dev/null +++ b/mmaction/models/heads/tsn_head.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model.weight_init import normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, get_str_type +from .base import AvgConsensus, BaseHead + + +@MODELS.register_module() +class TSNHead(BaseHead): + """Class head for TSN. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Default: dict(type='CrossEntropyLoss'). + spatial_type (str or ConfigDict): Pooling type in spatial dimension. + Default: 'avg'. + consensus (dict): Consensus config dict. + dropout_ratio (float): Probability of dropout layer. Default: 0.4. + init_std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + consensus: ConfigType = dict(type='AvgConsensus', dim=1), + dropout_ratio: float = 0.4, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + + consensus_ = consensus.copy() + + consensus_type = consensus_.pop('type') + if get_str_type(consensus_type) == 'AvgConsensus': + self.consensus = AvgConsensus(**consensus_) + else: + self.consensus = None + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool2d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + else: + self.avg_pool = None + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_cls, std=self.init_std) + + def forward(self, x: Tensor, num_segs: int, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + num_segs (int): Number of segments into which a video + is divided. + Returns: + Tensor: The classification scores for input samples. 
+ """ + # [N * num_segs, in_channels, 7, 7] + if self.avg_pool is not None: + if isinstance(x, tuple): + shapes = [y.shape for y in x] + assert 1 == 0, f'x is tuple {shapes}' + x = self.avg_pool(x) + # [N * num_segs, in_channels, 1, 1] + x = x.reshape((-1, num_segs) + x.shape[1:]) + # [N, num_segs, in_channels, 1, 1] + x = self.consensus(x) + # [N, 1, in_channels, 1, 1] + x = x.squeeze(1) + # [N, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1] + x = x.view(x.size(0), -1) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa296de665281eb9240ea1f628fe577cb019d58 --- /dev/null +++ b/mmaction/models/heads/uniformer_head.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmengine.fileio import load +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, get_str_type +from .base import BaseHead + + +@MODELS.register_module() +class UniFormerHead(BaseHead): + """Classification head for UniFormer. supports loading pretrained + Kinetics-710 checkpoint to fine-tuning on other Kinetics dataset. + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. + Defaults to : 0.0. 
@MODELS.register_module()
class UniFormerHead(BaseHead):
    """Classification head for UniFormer. supports loading pretrained
    Kinetics-710 checkpoint to fine-tuning on other Kinetics dataset.

    A pytorch implement of: `UniFormerV2: Spatiotemporal
    Learning by Arming Image ViTs with Video UniFormer
    <https://arxiv.org/abs/2211.09552>`

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        loss_cls (dict or ConfigDict): Config for building loss.
            Defaults to `dict(type='CrossEntropyLoss')`.
        dropout_ratio (float): Probability of dropout layer.
            Defaults to 0.0.
        channel_map (str, optional): Channel map file to selecting
            channels from pretrained head with extra channels.
            Defaults to None.
        init_cfg (dict or ConfigDict, optional): Config to control the
            initialization. Defaults to
            ``dict(type='TruncNormal', layer='Linear', std=0.02)``.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """

    # NOTE: the init_cfg default documented above was corrected to
    # std=0.02 to match the actual signature default (was stated as 0.01).

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 loss_cls: ConfigType = dict(type='CrossEntropyLoss'),
                 dropout_ratio: float = 0.0,
                 channel_map: Optional[str] = None,
                 init_cfg: Optional[dict] = dict(
                     type='TruncNormal', layer='Linear', std=0.02),
                 **kwargs) -> None:
        super().__init__(
            num_classes, in_channels, loss_cls, init_cfg=init_cfg, **kwargs)
        self.channel_map = channel_map
        self.dropout_ratio = dropout_ratio

        if self.dropout_ratio != 0:
            self.dropout = nn.Dropout(p=self.dropout_ratio)
        else:
            self.dropout = None
        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

    def _select_channels(self, state_dict) -> None:
        """Slice each pretrained-head tensor down to the channels listed in
        ``self.channel_map`` (in place)."""
        selected_channels = load(self.channel_map)
        for key in state_dict:
            state_dict[key] = state_dict[key][selected_channels]

    def init_weights(self) -> None:
        """Initiate the parameters either from a pretrained checkpoint or
        from scratch."""
        if get_str_type(self.init_cfg['type']) == 'Pretrained':
            assert self.channel_map is not None, \
                'load cls_head weights needs to specify the channel map file'
            logger = MMLogger.get_current_instance()
            pretrained = self.init_cfg['checkpoint']
            logger.info(f'load pretrained model from {pretrained}')
            state_dict = _load_checkpoint_with_prefix(
                'cls_head.', pretrained, map_location='cpu')
            self._select_channels(state_dict)
            # strict=False: the pretrained head may carry extra keys.
            msg = self.load_state_dict(state_dict, strict=False)
            logger.info(msg)
        else:
            super().init_weights()

    def forward(self, x: Tensor, **kwargs) -> Tensor:
        """Defines the computation performed at every call.

        Args:
            x (Tensor): Input features of shape ``(N, in_channels)``.

        Returns:
            Tensor: The classification scores for input samples.
        """
        # [N, in_channels]
        if self.dropout is not None:
            x = self.dropout(x)
        # [N, in_channels]
        cls_score = self.fc_cls(x)
        # [N, num_classes]
        return cls_score
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + spatial_type: str = 'avg', + dropout_ratio: float = 0.5, + init_std: float = 0.01, + fc1_bias: bool = False, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + + self.spatial_type = spatial_type + self.dropout_ratio = dropout_ratio + self.init_std = init_std + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.in_channels = in_channels + self.mid_channels = 2048 + self.num_classes = num_classes + self.fc1_bias = fc1_bias + + self.fc1 = nn.Linear( + self.in_channels, self.mid_channels, bias=self.fc1_bias) + self.fc2 = nn.Linear(self.mid_channels, self.num_classes) + + self.relu = nn.ReLU() + + self.pool = None + if self.spatial_type == 'avg': + self.pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + elif self.spatial_type == 'max': + self.pool = nn.AdaptiveMaxPool3d((1, 1, 1)) + else: + raise NotImplementedError + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc1, std=self.init_std) + normal_init(self.fc2, std=self.init_std) + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + # [N, in_channels, T, H, W] + assert self.pool is not None + x = self.pool(x) + # [N, in_channels, 1, 1, 1] + # [N, in_channels, 1, 1, 1] + x = x.view(x.shape[0], -1) + # [N, in_channels] + x = self.fc1(x) + # [N, 2048] + x = self.relu(x) + + if self.dropout is not None: + x = self.dropout(x) + + cls_score = self.fc2(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/localizers/__init__.py b/mmaction/models/localizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f42f0fd18d5ae83e3f2c3f3bf6839da30c2654ba --- /dev/null +++ b/mmaction/models/localizers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bmn import BMN +from .bsn import PEM, TEM +from .drn.drn import DRN +from .tcanet import TCANet + +__all__ = ['TEM', 'PEM', 'BMN', 'TCANet', 'DRN'] diff --git a/mmaction/models/localizers/__pycache__/__init__.cpython-312.pyc b/mmaction/models/localizers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31a9056e0c50d9f178d44612842ec2ce075738e1 Binary files /dev/null and b/mmaction/models/localizers/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/localizers/__pycache__/bmn.cpython-312.pyc b/mmaction/models/localizers/__pycache__/bmn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2af215f4d9ac2156cc89a4a101b97d04c414c93 Binary files /dev/null and b/mmaction/models/localizers/__pycache__/bmn.cpython-312.pyc differ diff --git a/mmaction/models/localizers/__pycache__/bsn.cpython-312.pyc b/mmaction/models/localizers/__pycache__/bsn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d182ab67f796e88020f37e7e1558f40a322ab54c Binary files /dev/null and b/mmaction/models/localizers/__pycache__/bsn.cpython-312.pyc differ diff --git a/mmaction/models/localizers/__pycache__/tcanet.cpython-312.pyc 
b/mmaction/models/localizers/__pycache__/tcanet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d47924a253fbb321c74c6796f3b62a995affa28 Binary files /dev/null and b/mmaction/models/localizers/__pycache__/tcanet.cpython-312.pyc differ diff --git a/mmaction/models/localizers/bmn.py b/mmaction/models/localizers/bmn.py new file mode 100644 index 0000000000000000000000000000000000000000..e24e5b16caff45fa29eeacdf8818348d19e43003 --- /dev/null +++ b/mmaction/models/localizers/bmn.py @@ -0,0 +1,467 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +import torch.nn as nn +from mmengine.model import BaseModel + +from mmaction.registry import MODELS +from .utils import post_processing, temporal_iop, temporal_iou + + +@MODELS.register_module() +class BMN(BaseModel): + """Boundary Matching Network for temporal action proposal generation. + + Please refer `BMN: Boundary-Matching Network for Temporal Action Proposal + Generation `_. + Code Reference https://github.com/JJBOY/BMN-Boundary-Matching-Network + Args: + temporal_dim (int): Total frames selected for each video. + boundary_ratio (float): Ratio for determining video boundaries. + num_samples (int): Number of samples for each proposal. + num_samples_per_bin (int): Number of bin samples for each sample. + feat_dim (int): Feature dimension. + soft_nms_alpha (float): Soft NMS alpha. + soft_nms_low_threshold (float): Soft NMS low threshold. + soft_nms_high_threshold (float): Soft NMS high threshold. + post_process_top_k (int): Top k proposals in post process. + feature_extraction_interval (int): + Interval used in feature extraction. Default: 16. + loss_cls (dict): Config for building loss. + Default: ``dict(type='BMNLoss')``. + hidden_dim_1d (int): Hidden dim for 1d conv. Default: 256. + hidden_dim_2d (int): Hidden dim for 2d conv. Default: 128. + hidden_dim_3d (int): Hidden dim for 3d conv. Default: 512. 
+ """ + + def __init__(self, + temporal_dim, + boundary_ratio, + num_samples, + num_samples_per_bin, + feat_dim, + soft_nms_alpha, + soft_nms_low_threshold, + soft_nms_high_threshold, + post_process_top_k, + feature_extraction_interval=16, + loss_cls=dict(type='BMNLoss'), + hidden_dim_1d=256, + hidden_dim_2d=128, + hidden_dim_3d=512): + super().__init__() + + self.tscale = temporal_dim + self.boundary_ratio = boundary_ratio + self.num_samples = num_samples + self.num_samples_per_bin = num_samples_per_bin + self.feat_dim = feat_dim + self.soft_nms_alpha = soft_nms_alpha + self.soft_nms_low_threshold = soft_nms_low_threshold + self.soft_nms_high_threshold = soft_nms_high_threshold + self.post_process_top_k = post_process_top_k + self.feature_extraction_interval = feature_extraction_interval + self.loss_cls = MODELS.build(loss_cls) + self.hidden_dim_1d = hidden_dim_1d + self.hidden_dim_2d = hidden_dim_2d + self.hidden_dim_3d = hidden_dim_3d + + self._get_interp1d_mask() + + # Base Module + self.x_1d_b = nn.Sequential( + nn.Conv1d( + self.feat_dim, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True)) + + # Temporal Evaluation Module + self.x_1d_s = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d(self.hidden_dim_1d, 1, kernel_size=1), nn.Sigmoid()) + self.x_1d_e = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4), nn.ReLU(inplace=True), + nn.Conv1d(self.hidden_dim_1d, 1, kernel_size=1), nn.Sigmoid()) + + # Proposal Evaluation Module + self.x_1d_p = nn.Sequential( + nn.Conv1d( + self.hidden_dim_1d, + self.hidden_dim_1d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True)) + self.x_3d_p = nn.Sequential( + nn.Conv3d( + self.hidden_dim_1d, + 
self.hidden_dim_3d, + kernel_size=(self.num_samples, 1, 1)), nn.ReLU(inplace=True)) + self.x_2d_p = nn.Sequential( + nn.Conv2d(self.hidden_dim_3d, self.hidden_dim_2d, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + self.hidden_dim_2d, + self.hidden_dim_2d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv2d( + self.hidden_dim_2d, + self.hidden_dim_2d, + kernel_size=3, + padding=1), nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dim_2d, 2, kernel_size=1), nn.Sigmoid()) + self.anchors_tmins, self.anchors_tmaxs = self._temporal_anchors( + -0.5, 1.5) + self.match_map = self._match_map() + # self.bm_mask = self._get_bm_mask() + self.register_buffer('bm_mask', self._get_bm_mask()) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + pass + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. 
    def loss(self, batch_inputs, batch_data_samples, **kwargs):
        """Calculate losses from a batch of inputs and data samples.

        Args:
            batch_inputs (Tensor): Raw Inputs of the recognizer.
                These should usually be mean centered and std scaled.
            batch_data_samples (List[:obj:`ActionDataSample`]): The batch
                data samples. It usually includes information such
                as ``gt_labels``.

        Returns:
            dict: A dictionary of loss components.
        """
        gt_bbox = [
            sample.gt_instances['gt_bbox'] for sample in batch_data_samples
        ]
        # Dense supervision maps derived from the ground-truth segments:
        # a confidence map over (duration, start) plus start/end curves.
        label_confidence, label_start, label_end = self.generate_labels(
            gt_bbox)

        device = batch_inputs.device
        label_confidence = label_confidence.to(device)
        label_start = label_start.to(device)
        label_end = label_end.to(device)

        confidence_map, start, end = self._forward(batch_inputs)

        loss = self.loss_cls(confidence_map, start, end, label_confidence,
                             label_start, label_end, self.bm_mask)
        # Only the first element of the returned losses is reported.
        loss_dict = dict(loss=loss[0])
        return loss_dict

    def predict(self, batch_inputs, batch_data_samples, **kwargs):
        """Define the computation performed at every call when testing.

        NOTE(review): only the first sample of the batch is decoded
        (``start[0]``, ``batch_data_samples[0]``) — presumably a test-time
        batch size of 1 is assumed; confirm with the caller.
        """
        confidence_map, start, end = self._forward(batch_inputs)
        start_scores = start[0].cpu().numpy()
        end_scores = end[0].cpu().numpy()
        # Channel 1 of the confidence map is the classification branch,
        # channel 0 the regression branch (see ``x_2d_p``'s 2 outputs).
        cls_confidence = (confidence_map[0][1]).cpu().numpy()
        reg_confidence = (confidence_map[0][0]).cpu().numpy()

        max_start = max(start_scores)
        max_end = max(end_scores)

        # generate the set of start points and end points: keep local peaks
        # or positions scoring above half of the global maximum.
        start_bins = np.zeros(len(start_scores))
        start_bins[0] = 1  # [1,0,0...,0,0]
        end_bins = np.zeros(len(end_scores))
        end_bins[-1] = 1  # [0,0,0...,0,1]
        for idx in range(1, self.tscale - 1):
            if start_scores[idx] > start_scores[
                    idx + 1] and start_scores[idx] > start_scores[idx - 1]:
                start_bins[idx] = 1
            elif start_scores[idx] > (0.5 * max_start):
                start_bins[idx] = 1
            if end_scores[idx] > end_scores[
                    idx + 1] and end_scores[idx] > end_scores[idx - 1]:
                end_bins[idx] = 1
            elif end_scores[idx] > (0.5 * max_end):
                end_bins[idx] = 1

        # iterate through all combinations of start_index and end_index;
        # ``idx`` enumerates durations, ``jdx`` start positions.
        new_proposals = []
        for idx in range(self.tscale):
            for jdx in range(self.tscale):
                start_index = jdx
                end_index = start_index + idx + 1
                if end_index < self.tscale and start_bins[
                        start_index] == 1 and end_bins[end_index] == 1:
                    tmin = start_index / self.tscale
                    tmax = end_index / self.tscale
                    tmin_score = start_scores[start_index]
                    tmax_score = end_scores[end_index]
                    cls_score = cls_confidence[idx, jdx]
                    reg_score = reg_confidence[idx, jdx]
                    # Final ranking score is the product of the four terms.
                    score = tmin_score * tmax_score * cls_score * reg_score
                    new_proposals.append([
                        tmin, tmax, tmin_score, tmax_score, cls_score,
                        reg_score, score
                    ])
        new_proposals = np.stack(new_proposals)
        video_info = batch_data_samples[0].metainfo
        # Soft-NMS and top-k selection happen in ``post_processing``.
        proposal_list = post_processing(new_proposals, video_info,
                                        self.soft_nms_alpha,
                                        self.soft_nms_low_threshold,
                                        self.soft_nms_high_threshold,
                                        self.post_process_top_k,
                                        self.feature_extraction_interval)
        output = [
            dict(
                video_name=video_info['video_name'],
                proposal_list=proposal_list)
        ]
        return output
num_samples_per_bin] + bin_vector = np.zeros(tscale) + for sample in bin_samples: + sample_upper = math.ceil(sample) + sample_decimal, sample_down = math.modf(sample) + if 0 <= int(sample_down) <= (tscale - 1): + bin_vector[int(sample_down)] += 1 - sample_decimal + if 0 <= int(sample_upper) <= (tscale - 1): + bin_vector[int(sample_upper)] += sample_decimal + bin_vector = 1.0 / num_samples_per_bin * bin_vector + p_mask.append(bin_vector) + p_mask = np.stack(p_mask, axis=1) + return p_mask + + def _get_interp1d_mask(self): + """Generate sample mask for each point in Boundary-Matching Map.""" + mask_mat = [] + for start_index in range(self.tscale): + mask_mat_vector = [] + for duration_index in range(self.tscale): + if start_index + duration_index < self.tscale: + p_tmin = start_index + p_tmax = start_index + duration_index + center_len = float(p_tmax - p_tmin) + 1 + sample_tmin = p_tmin - (center_len * self.boundary_ratio) + sample_tmax = p_tmax + (center_len * self.boundary_ratio) + p_mask = self._get_interp1d_bin_mask( + sample_tmin, sample_tmax, self.tscale, + self.num_samples, self.num_samples_per_bin) + else: + p_mask = np.zeros([self.tscale, self.num_samples]) + mask_mat_vector.append(p_mask) + mask_mat_vector = np.stack(mask_mat_vector, axis=2) + mask_mat.append(mask_mat_vector) + mask_mat = np.stack(mask_mat, axis=3) + mask_mat = mask_mat.astype(np.float32) + self.sample_mask = nn.Parameter( + torch.tensor(mask_mat).view(self.tscale, -1), requires_grad=False) + + def _get_bm_mask(self): + """Generate Boundary-Matching Mask.""" + bm_mask = [] + for idx in range(self.tscale): + mask_vector = [1] * (self.tscale - idx) + [0] * idx + bm_mask.append(mask_vector) + bm_mask = torch.tensor(bm_mask, dtype=torch.float) + return bm_mask + + def _match_map(self): + """Generate match map.""" + temporal_gap = 1. 
/ self.tscale + match_map = [] + for idx in range(self.tscale): + match_window = [] + tmin = temporal_gap * idx + for jdx in range(1, self.tscale + 1): + tmax = tmin + temporal_gap * jdx + match_window.append([tmin, tmax]) + match_map.append(match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + return match_map + + def _temporal_anchors(self, tmin_offset=0., tmax_offset=1.): + """Generate temporal anchors. + + Args: + tmin_offset (int): Offset for the minimum value of temporal anchor. + Default: 0. + tmax_offset (int): Offset for the maximum value of temporal anchor. + Default: 1. + Returns: + tuple[Sequence[float]]: The minimum and maximum values of temporal + anchors. + """ + temporal_gap = 1. / self.tscale + anchors_tmins = [] + anchors_tmaxs = [] + for i in range(self.tscale): + anchors_tmins.append(temporal_gap * (i + tmin_offset)) + anchors_tmaxs.append(temporal_gap * (i + tmax_offset)) + + return anchors_tmins, anchors_tmaxs + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. 
+ """ + # x.shape [batch_size, self.feat_dim, self.tscale] + base_feature = self.x_1d_b(x) + # base_feature.shape [batch_size, self.hidden_dim_1d, self.tscale] + start = self.x_1d_s(base_feature).squeeze(1) + # start.shape [batch_size, self.tscale] + end = self.x_1d_e(base_feature).squeeze(1) + # end.shape [batch_size, self.tscale] + confidence_map = self.x_1d_p(base_feature) + # [batch_size, self.hidden_dim_1d, self.tscale] + confidence_map = self._boundary_matching_layer(confidence_map) + # [batch_size, self.hidden_dim_1d,, self.num_sampls, self.tscale, self.tscale] # noqa + confidence_map = self.x_3d_p(confidence_map).squeeze(2) + # [batch_size, self.hidden_dim_3d, self.tscale, self.tscale] + confidence_map = self.x_2d_p(confidence_map) + # [batch_size, 2, self.tscale, self.tscale] + + return confidence_map, start, end + + def _boundary_matching_layer(self, x): + """Generate matching layer.""" + input_size = x.size() + out = torch.matmul(x, + self.sample_mask).reshape(input_size[0], + input_size[1], + self.num_samples, + self.tscale, self.tscale) + return out + + def generate_labels(self, gt_bbox): + """Generate training labels.""" + # TODO: do this without numpy + match_score_confidence_list = [] + match_score_start_list = [] + match_score_end_list = [] + for every_gt_bbox in gt_bbox: + gt_iou_map = [] + every_gt_bbox = every_gt_bbox.cpu() + for start, end in every_gt_bbox: + if isinstance(start, torch.Tensor): + start = start.numpy() + if isinstance(end, torch.Tensor): + end = end.numpy() + current_gt_iou_map = temporal_iou(self.match_map[:, 0], + self.match_map[:, 1], start, + end) + current_gt_iou_map = np.reshape(current_gt_iou_map, + [self.tscale, self.tscale]) + gt_iou_map.append(current_gt_iou_map) + gt_iou_map = np.array(gt_iou_map).astype(np.float32) + gt_iou_map = np.max(gt_iou_map, axis=0) + + gt_tmins = every_gt_bbox[:, 0] + gt_tmaxs = every_gt_bbox[:, 1] + + gt_len_pad = 3 * (1. 
/ self.tscale) + + gt_start_bboxs = np.stack( + (gt_tmins - gt_len_pad / 2, gt_tmins + gt_len_pad / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_tmaxs - gt_len_pad / 2, gt_tmaxs + gt_len_pad / 2), axis=1) + + match_score_start = [] + match_score_end = [] + + for anchor_tmin, anchor_tmax in zip(self.anchors_tmins, + self.anchors_tmaxs): + match_score_start.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_start_bboxs[:, 0], gt_start_bboxs[:, + 1]))) + match_score_end.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_end_bboxs[:, 0], gt_end_bboxs[:, 1]))) + match_score_confidence_list.append(gt_iou_map) + match_score_start_list.append(match_score_start) + match_score_end_list.append(match_score_end) + + def to_tensor(x): + return torch.Tensor(np.array(x)) + + match_score_confidence_list = to_tensor(match_score_confidence_list) + match_score_start_list = to_tensor(match_score_start_list) + match_score_end_list = to_tensor(match_score_end_list) + return (match_score_confidence_list, match_score_start_list, + match_score_end_list) diff --git a/mmaction/models/localizers/bsn.py b/mmaction/models/localizers/bsn.py new file mode 100644 index 0000000000000000000000000000000000000000..2f084c7970e4d536a1ba2de03eefb1697dda3ee7 --- /dev/null +++ b/mmaction/models/localizers/bsn.py @@ -0,0 +1,506 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel +from mmengine.model.weight_init import constant_init, kaiming_init + +from mmaction.registry import MODELS +from .utils import post_processing, temporal_iop + + +@MODELS.register_module() +class TEM(BaseModel): + """Temporal Evaluation Model for Boundary Sensitive Network. + + Please refer `BSN: Boundary Sensitive Network for Temporal Action + Proposal Generation `_. 
+ Code reference + https://github.com/wzmsltw/BSN-boundary-sensitive-network + Args: + temporal_dim (int): Total frames selected for each video. + tem_feat_dim (int): Feature dimension. + tem_hidden_dim (int): Hidden layer dimension. + tem_match_threshold (float): Temporal evaluation match threshold. + loss_cls (dict): Config for building loss. + Default: ``dict(type='BinaryLogisticRegressionLoss')``. + loss_weight (float): Weight term for action_loss. Default: 2. + output_dim (int): Output dimension. Default: 3. + conv1_ratio (float): Ratio of conv1 layer output. Default: 1.0. + conv2_ratio (float): Ratio of conv2 layer output. Default: 1.0. + conv3_ratio (float): Ratio of conv3 layer output. Default: 0.01. + """ + + def __init__(self, + temporal_dim, + boundary_ratio, + tem_feat_dim, + tem_hidden_dim, + tem_match_threshold, + loss_cls=dict(type='BinaryLogisticRegressionLoss'), + loss_weight=2, + output_dim=3, + conv1_ratio=1, + conv2_ratio=1, + conv3_ratio=0.01): + super().__init__() + + self.temporal_dim = temporal_dim + self.boundary_ratio = boundary_ratio + self.feat_dim = tem_feat_dim + self.c_hidden = tem_hidden_dim + self.match_threshold = tem_match_threshold + self.output_dim = output_dim + self.loss_cls = MODELS.build(loss_cls) + self.loss_weight = loss_weight + self.conv1_ratio = conv1_ratio + self.conv2_ratio = conv2_ratio + self.conv3_ratio = conv3_ratio + + self.conv1 = nn.Conv1d( + in_channels=self.feat_dim, + out_channels=self.c_hidden, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.conv2 = nn.Conv1d( + in_channels=self.c_hidden, + out_channels=self.c_hidden, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.conv3 = nn.Conv1d( + in_channels=self.c_hidden, + out_channels=self.output_dim, + kernel_size=1, + stride=1, + padding=0) + self.anchors_tmins, self.anchors_tmaxs = self._temporal_anchors() + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in 
self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + def _temporal_anchors(self, tmin_offset=0., tmax_offset=1.): + """Generate temporal anchors. + + Args: + tmin_offset (int): Offset for the minimum value of temporal anchor. + Default: 0. + tmax_offset (int): Offset for the maximum value of temporal anchor. + Default: 1. + Returns: + tuple[Sequence[float]]: The minimum and maximum values of temporal + anchors. + """ + temporal_gap = 1. / self.temporal_dim + anchors_tmins = [] + anchors_tmaxs = [] + for i in range(self.temporal_dim): + anchors_tmins.append(temporal_gap * (i + tmin_offset)) + anchors_tmaxs.append(temporal_gap * (i + tmax_offset)) + + return anchors_tmins, anchors_tmaxs + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. + """ + x = F.relu(self.conv1_ratio * self.conv1(x)) + x = F.relu(self.conv2_ratio * self.conv2(x)) + x = torch.sigmoid(self.conv3_ratio * self.conv3(x)) + return x + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as ``gt_labels``. + + Returns: + dict: A dictionary of loss components. 
+ """ + tem_output = self._forward(batch_inputs) + + score_action = tem_output[:, 0, :] + score_start = tem_output[:, 1, :] + score_end = tem_output[:, 2, :] + + gt_bbox = [ + sample.gt_instances['gt_bbox'] for sample in batch_data_samples + ] + label_action, label_start, label_end = self.generate_labels(gt_bbox) + device = batch_inputs.device + label_action = label_action.to(device) + label_start = label_start.to(device) + label_end = label_end.to(device) + + loss_action = self.loss_cls(score_action, label_action, + self.match_threshold) + loss_start = self.loss_cls(score_start, label_start, + self.match_threshold) + loss_end = self.loss_cls(score_end, label_end, self.match_threshold) + + loss_dict = { + 'loss_action': loss_action * self.loss_weight, + 'loss_start': loss_start, + 'loss_end': loss_end + } + + return loss_dict + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + """Define the computation performed at every call when testing.""" + tem_output = self._forward(batch_inputs).cpu().numpy() + batch_action = tem_output[:, 0, :] + batch_start = tem_output[:, 1, :] + batch_end = tem_output[:, 2, :] + + video_results = [] + for batch_idx, _ in enumerate(batch_action): + video_name = batch_data_samples[batch_idx].metainfo['video_name'] + video_action = batch_action[batch_idx] + video_start = batch_start[batch_idx] + video_end = batch_end[batch_idx] + video_result = np.stack((video_action, video_start, video_end, + self.anchors_tmins, self.anchors_tmaxs), + axis=1) + video_results.append((video_name, video_result)) + return video_results + + def generate_labels(self, gt_bbox): + """Generate training labels.""" + # TODO: do this without numpy + match_score_action_list = [] + match_score_start_list = [] + match_score_end_list = [] + for every_gt_bbox in gt_bbox: + gt_tmins = every_gt_bbox[:, 0].cpu().numpy() + gt_tmaxs = every_gt_bbox[:, 1].cpu().numpy() + + gt_lens = gt_tmaxs - gt_tmins + gt_len_pad = np.maximum(1. 
/ self.temporal_dim, + self.boundary_ratio * gt_lens) + + gt_start_bboxs = np.stack( + (gt_tmins - gt_len_pad / 2, gt_tmins + gt_len_pad / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_tmaxs - gt_len_pad / 2, gt_tmaxs + gt_len_pad / 2), axis=1) + + match_score_action = [] + match_score_start = [] + match_score_end = [] + + for anchor_tmin, anchor_tmax in zip(self.anchors_tmins, + self.anchors_tmaxs): + match_score_action.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, gt_tmins, + gt_tmaxs))) + match_score_start.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_start_bboxs[:, 0], gt_start_bboxs[:, + 1]))) + match_score_end.append( + np.max( + temporal_iop(anchor_tmin, anchor_tmax, + gt_end_bboxs[:, 0], gt_end_bboxs[:, 1]))) + match_score_action_list.append(match_score_action) + match_score_start_list.append(match_score_start) + match_score_end_list.append(match_score_end) + match_score_action_list = torch.Tensor(match_score_action_list) + match_score_start_list = torch.Tensor(match_score_start_list) + match_score_end_list = torch.Tensor(match_score_end_list) + return (match_score_action_list, match_score_start_list, + match_score_end_list) + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Tensor): The input tensor with shape + (N, C, ...) in general. 
+ data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + if type(inputs) is not torch.Tensor: + inputs = torch.stack(inputs) + + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + +@MODELS.register_module() +class PEM(BaseModel): + """Proposals Evaluation Model for Boundary Sensitive Network. + + Please refer `BSN: Boundary Sensitive Network for Temporal Action + Proposal Generation `_. + Code reference + https://github.com/wzmsltw/BSN-boundary-sensitive-network + Args: + pem_feat_dim (int): Feature dimension. + pem_hidden_dim (int): Hidden layer dimension. + pem_u_ratio_m (float): Ratio for medium score proprosals to balance + data. + pem_u_ratio_l (float): Ratio for low score proprosals to balance data. + pem_high_temporal_iou_threshold (float): High IoU threshold. + pem_low_temporal_iou_threshold (float): Low IoU threshold. + soft_nms_alpha (float): Soft NMS alpha. + soft_nms_low_threshold (float): Soft NMS low threshold. + soft_nms_high_threshold (float): Soft NMS high threshold. + post_process_top_k (int): Top k proposals in post process. + feature_extraction_interval (int): + Interval used in feature extraction. Default: 16. + fc1_ratio (float): Ratio for fc1 layer output. Default: 0.1. + fc2_ratio (float): Ratio for fc2 layer output. Default: 0.1. + output_dim (int): Output dimension. Default: 1. 
    def __init__(self,
                 pem_feat_dim: int,
                 pem_hidden_dim: int,
                 pem_u_ratio_m: float,
                 pem_u_ratio_l: float,
                 pem_high_temporal_iou_threshold: float,
                 pem_low_temporal_iou_threshold: float,
                 soft_nms_alpha: float,
                 soft_nms_low_threshold: float,
                 soft_nms_high_threshold: float,
                 post_process_top_k: int,
                 feature_extraction_interval: int = 16,
                 fc1_ratio: float = 0.1,
                 fc2_ratio: float = 0.1,
                 output_dim: int = 1):
        super().__init__()

        self.feat_dim = pem_feat_dim
        self.hidden_dim = pem_hidden_dim
        # Sampling ratios used in ``loss`` to balance medium/low-IoU
        # proposals against the high-IoU ones.
        self.u_ratio_m = pem_u_ratio_m
        self.u_ratio_l = pem_u_ratio_l
        self.pem_high_temporal_iou_threshold = pem_high_temporal_iou_threshold
        self.pem_low_temporal_iou_threshold = pem_low_temporal_iou_threshold
        self.soft_nms_alpha = soft_nms_alpha
        self.soft_nms_low_threshold = soft_nms_low_threshold
        self.soft_nms_high_threshold = soft_nms_high_threshold
        self.post_process_top_k = post_process_top_k
        self.feature_extraction_interval = feature_extraction_interval
        self.fc1_ratio = fc1_ratio
        self.fc2_ratio = fc2_ratio
        self.output_dim = output_dim

        # PEM is a two-layer MLP scoring each proposal's BSP feature.
        self.fc1 = nn.Linear(
            in_features=self.feat_dim, out_features=self.hidden_dim, bias=True)
        self.fc2 = nn.Linear(
            in_features=self.hidden_dim,
            out_features=self.output_dim,
            bias=True)

    def init_weights(self) -> None:
        """Initiate the parameters either from existing checkpoint or from
        scratch.

        NOTE(review): this loop only targets ``nn.Conv2d`` and
        ``nn.BatchNorm2d``, but PEM is built solely from ``nn.Linear``
        layers, so it is effectively a no-op — it looks copied from TEM;
        confirm whether an explicit Linear initialization was intended.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                kaiming_init(m)
            elif isinstance(m, nn.BatchNorm2d):
                constant_init(m, 1)

    def _forward(self, x):
        """Define the computation performed at every call.

        Args:
            x (torch.Tensor): The input data.
        Returns:
            torch.Tensor: The output of the module.
        """
        # Each fc output is rescaled by its ratio before the nonlinearity.
        x = F.relu(self.fc1_ratio * self.fc1(x))
        x = torch.sigmoid(self.fc2_ratio * self.fc2(x))
        return x
+ + Args: + batch_inputs (Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as ``gt_labels``. + + Returns: + dict: A dictionary of loss components. + """ + device = self.fc1.weight.device + + bsp_feature = torch.cat([ + sample.gt_instances['bsp_feature'] for sample in batch_data_samples + ]).to(device) + + reference_temporal_iou = torch.cat([ + sample.gt_instances['reference_temporal_iou'] + for sample in batch_data_samples + ]).to(device) + + pem_output = self._forward(bsp_feature) + + anchors_temporal_iou = pem_output.view(-1) + u_hmask = (reference_temporal_iou > + self.pem_high_temporal_iou_threshold).float() + u_mmask = ( + (reference_temporal_iou <= self.pem_high_temporal_iou_threshold) + & (reference_temporal_iou > self.pem_low_temporal_iou_threshold) + ).float() + u_lmask = (reference_temporal_iou <= + self.pem_low_temporal_iou_threshold).float() + + num_h = torch.sum(u_hmask) + num_m = torch.sum(u_mmask) + num_l = torch.sum(u_lmask) + + r_m = self.u_ratio_m * num_h / (num_m) + r_m = torch.min(r_m, torch.Tensor([1.0]).to(device))[0] + u_smmask = torch.rand(u_hmask.size()[0], device=device) + u_smmask = u_smmask * u_mmask + u_smmask = (u_smmask > (1. - r_m)).float() + + r_l = self.u_ratio_l * num_h / (num_l) + r_l = torch.min(r_l, torch.Tensor([1.0]).to(device))[0] + u_slmask = torch.rand(u_hmask.size()[0], device=device) + u_slmask = u_slmask * u_lmask + u_slmask = (u_slmask > (1. 
- r_l)).float() + + temporal_iou_weights = u_hmask + u_smmask + u_slmask + temporal_iou_loss = F.smooth_l1_loss(anchors_temporal_iou, + reference_temporal_iou) + temporal_iou_loss = torch.sum( + temporal_iou_loss * + temporal_iou_weights) / torch.sum(temporal_iou_weights) + loss_dict = dict(temporal_iou_loss=temporal_iou_loss) + + return loss_dict + + def _parse(self, gt_instances, key): + out = torch.cat([gt[key] for gt in gt_instances]) + out = out.view(-1).cpu().numpy().reshape(-1, 1) + return out + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + """Define the computation performed at every call when testing.""" + device = self.fc1.weight.device + + bsp_feature = torch.cat([ + sample.gt_instances['bsp_feature'] for sample in batch_data_samples + ]).to(device) + + pem_output = self._forward(bsp_feature).view(-1).cpu().numpy() + pem_output = pem_output.reshape(-1, 1) + + gt_instances = [sample.gt_instances for sample in batch_data_samples] + + tmin = self._parse(gt_instances, 'tmin') + tmax = self._parse(gt_instances, 'tmax') + tmin_score = self._parse(gt_instances, 'tmin_score') + tmax_score = self._parse(gt_instances, 'tmax_score') + + score = np.array(pem_output * tmin_score * tmax_score).reshape(-1, 1) + result = np.concatenate( + (tmin, tmax, tmin_score, tmax_score, pem_output, score), axis=1) + result = result.reshape(-1, 6) + + video_info = batch_data_samples[0].metainfo + proposal_list = post_processing(result, video_info, + self.soft_nms_alpha, + self.soft_nms_low_threshold, + self.soft_nms_high_threshold, + self.post_process_top_k, + self.feature_extraction_interval) + output = [ + dict( + video_name=video_info['video_name'], + proposal_list=proposal_list) + ] + return output + + def forward(self, inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. 
+ + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + batch_inputs (Tensor): The input tensor with shape + (N, C, ...) in general. + batch_data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + inputs = torch.stack(inputs) + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". 
@MODELS.register_module()
class DRN(BaseModel):
    """Dense Regression Network for Video Grounding.

    Please refer `Dense Regression Network for Video Grounding
    <https://arxiv.org/abs/2004.03545>`_.
    Code Reference: https://github.com/Alvin-Zeng/DRN

    Args:
        vocab_size (int): number of all possible words in the query.
            Defaults to 1301.
        hidden_dim (int): the hidden dimension of the LSTM in the
            language model. Defaults to 512.
        embed_dim (int): the embedding dimension of the query. Defaults
            to 300.
        bidirection (bool): if True, use bi-direction LSTM in the
            language model. Defaults to True.
        first_output_dim (int): the output dimension of the first layer
            in the backbone. Defaults to 256.
        fpn_feature_dim (int): the output dimension of the FPN. Defaults
            to 512.
        feature_dim (int): the dimension of the video clip feature.
            Defaults to 4096.
        lstm_layers (int): the number of LSTM layers in the language model.
            Defaults to 1.
        fcos_pre_nms_top_n (int): value of Top-N in the FCOS module before
            nms. Defaults to 32.
        fcos_inference_thr (float): threshold in the FCOS inference. BBoxes
            with scores higher than this threshold are regarded as positive.
            Defaults to 0.05.
        fcos_prior_prob (float): A prior probability of the positive bboxes.
            Used to initialize the bias of the classification head.
            Defaults to 0.01.
        focal_alpha (float): Focal loss hyper-parameter alpha.
            Defaults to 0.25.
        focal_gamma (float): Focal loss hyper-parameter gamma.
            Defaults to 2.0.
        fpn_stride (Sequence[int]): the strides in the FPN. Defaults to
            [1, 2, 4].
        fcos_nms_thr (float): NMS threshold in the FCOS module.
            Defaults to 0.6.
        fcos_conv_layers (int): number of convolution layers in FCOS.
            Defaults to 1.
        fcos_num_class (int): number of classes in FCOS.
            Defaults to 2.
        is_first_stage (bool): if true, the model is in the first stage
            training.
        is_second_stage (bool): if true, the model is in the second stage
            training.
    """

    def __init__(self,
                 vocab_size: int = 1301,
                 hidden_dim: int = 512,
                 embed_dim: int = 300,
                 bidirection: bool = True,
                 first_output_dim: int = 256,
                 fpn_feature_dim: int = 512,
                 feature_dim: int = 4096,
                 lstm_layers: int = 1,
                 fcos_pre_nms_top_n: int = 32,
                 fcos_inference_thr: float = 0.05,
                 fcos_prior_prob: float = 0.01,
                 focal_alpha: float = 0.25,
                 focal_gamma: float = 2.0,
                 fpn_stride: Sequence[int] = [1, 2, 4],
                 fcos_nms_thr: float = 0.6,
                 fcos_conv_layers: int = 1,
                 fcos_num_class: int = 2,
                 is_first_stage: bool = False,
                 is_second_stage: bool = False,
                 init_cfg: OptConfigType = None,
                 **kwargs) -> None:
        super(DRN, self).__init__(init_cfg)

        # Bi-LSTM language model producing one query feature per level.
        self.query_encoder = QueryEncoder(
            vocab_size=vocab_size,
            hidden_dim=hidden_dim,
            embed_dim=embed_dim,
            num_layers=lstm_layers,
            bidirection=bidirection)

        # (in_channels, out_channels, kernel, stride) per backbone level; the
        # first level also takes the 256-d positional features (hence +256).
        channels_list = [
            (feature_dim + 256, first_output_dim, 3, 1),
            (first_output_dim, first_output_dim * 2, 3, 2),
            (first_output_dim * 2, first_output_dim * 4, 3, 2),
        ]
        self.backbone_net = Backbone(channels_list)

        self.fpn = FPN(
            in_channels_list=[256, 512, 1024], out_channels=fpn_feature_dim)

        self.fcos = FCOSModule(
            in_channels=fpn_feature_dim,
            fcos_num_class=fcos_num_class,
            fcos_conv_layers=fcos_conv_layers,
            fcos_prior_prob=fcos_prior_prob,
            fcos_inference_thr=fcos_inference_thr,
            fcos_pre_nms_top_n=fcos_pre_nms_top_n,
            fcos_nms_thr=fcos_nms_thr,
            test_detections_per_img=32,
            fpn_stride=fpn_stride,
            focal_alpha=focal_alpha,
            focal_gamma=focal_gamma,
            is_first_stage=is_first_stage,
            is_second_stage=is_second_stage)

        self.prop_fc = nn.Linear(feature_dim, feature_dim)
        # Maps (start, end, duration) triplets to 256-d positional features.
        self.position_transform = nn.Linear(3, 256)

        # One projection of the query feature per backbone level; level 0
        # projects to the clip feature dim, deeper levels to the previous
        # level's output dim.
        qInput = []
        for t in range(len(channels_list)):
            if t > 0:
                qInput += [nn.Linear(1024, channels_list[t - 1][1])]
            else:
                qInput += [nn.Linear(1024, feature_dim)]
        self.qInput = nn.ModuleList(qInput)

        self.is_second_stage = is_second_stage

    def forward(self, inputs, data_samples, mode, **kwargs):
        """Unified entry point.

        Args:
            inputs (list[Tensor]): per-sample proposal features, stacked
                into a batch here.
            data_samples (list): data samples carrying ``proposals``,
                ``gt_instances`` and metainfo (query tokens/length).
            mode (str): ``'loss'`` or ``'predict'``.

        Returns:
            dict of losses for ``'loss'``; a one-element result list for
            ``'predict'`` (only batch size 1 is supported there).
        """
        props_features = torch.stack(inputs)
        batch_size = props_features.shape[0]
        device = props_features.device
        proposals = torch.stack([
            sample.proposals['proposals'] for sample in data_samples
        ]).to(device)
        gt_bbox = torch.stack([
            sample.gt_instances['gt_bbox'] for sample in data_samples
        ]).to(device)

        video_info = [i.metainfo for i in data_samples]
        query_tokens_ = [i['query_tokens'] for i in video_info]
        query_length = [i['query_length'] for i in video_info]
        query_length = torch.from_numpy(np.array(query_length))

        # Right-pad every query to the longest one in the batch.
        max_query_len = max([i.shape[0] for i in query_tokens_])
        query_tokens = torch.zeros(batch_size, max_query_len)
        for idx, query_token in enumerate(query_tokens_):
            query_len = query_token.shape[0]
            query_tokens[idx, :query_len] = query_token

        query_tokens = query_tokens.to(device).long()
        # Lengths are moved back to CPU inside QueryEncoder, where
        # pack_padded_sequence requires CPU lengths.
        query_length = query_length.to(device).long()

        # pack_padded_sequence (inside the query encoder) expects samples
        # sorted by decreasing length.
        sort_index = query_length.argsort(descending=True)
        box_lists, loss_dict = self._forward(query_tokens[sort_index],
                                             query_length[sort_index],
                                             props_features[sort_index],
                                             proposals[sort_index],
                                             gt_bbox[sort_index])
        if mode == 'loss':
            return loss_dict
        elif mode == 'predict':
            # only support batch size = 1
            bbox = box_lists[0]

            per_vid_detections = bbox['detections']
            per_vid_scores = bbox['scores']

            props_pred = torch.cat(
                (per_vid_detections, per_vid_scores.unsqueeze(-1)), dim=-1)

            props_pred = props_pred.cpu().numpy()
            props_pred = sorted(props_pred, key=lambda x: x[-1], reverse=True)
            props_pred = np.array(props_pred)

            props_pred = soft_nms(
                props_pred,
                alpha=0.4,
                low_threshold=0.5,
                high_threshold=0.9,
                top_k=5)
            result = {
                'vid_name': data_samples[0].metainfo['vid_name'],
                'gt': gt_bbox[0].cpu().numpy(),
                'predictions': props_pred,
            }
            return [result]

        raise ValueError(f'Unsupported mode {mode}!')

    def nms_temporal(self, start, end, score, overlap=0.45):
        """Classic temporal NMS over (start, end, score) triplets.

        Returns the kept indices (numpy array), highest score first.
        NOTE(review): this helper is not called anywhere in this class —
        prediction uses ``soft_nms`` instead; presumably kept from the
        upstream implementation.
        """
        pick = []
        assert len(start) == len(score)
        assert len(end) == len(score)
        if len(start) == 0:
            return pick

        union = end - start
        # sort and get index
        intervals = [
            i[0] for i in sorted(enumerate(score), key=lambda x: x[1])
        ]

        while len(intervals) > 0:
            # Highest-scoring remaining interval.
            i = intervals[-1]
            pick.append(i)

            xx1 = [max(start[i], start[j]) for j in intervals[:-1]]
            xx2 = [min(end[i], end[j]) for j in intervals[:-1]]
            inter = [max(0., k2 - k1) for k1, k2 in zip(xx1, xx2)]
            # Temporal IoU against every other remaining interval.
            o = [
                inter[u] / (union[i] + union[intervals[u]] - inter[u])
                for u in range(len(intervals) - 1)
            ]
            I_new = []
            for j in range(len(o)):
                if o[j] <= overlap:
                    I_new.append(intervals[j])
            intervals = I_new
        return np.array(pick)

    def _forward(self, query_tokens, query_length, props_features,
                 props_start_end, gt_bbox):
        """Run language encoder, backbone, FPN and FCOS head."""
        # position_info[i] holds the (start, end) pairs for level i; levels
        # 0 and 1 share the full-resolution proposals, deeper levels are
        # built below by striding over starts/ends.
        position_info = [props_start_end, props_start_end]
        position_feats = []
        query_features = self.query_encoder(query_tokens, query_length)
        for i in range(len(query_features)):
            query_features[i] = self.qInput[i](query_features[i])
            if i > 1:
                position_info.append(
                    torch.cat([
                        props_start_end[:, ::2 * (i - 1), [0]],
                        props_start_end[:, 1::2 * (i - 1), [1]]
                    ],
                              dim=-1))
            props_duration = position_info[i][:, :, 1] - position_info[i][:, :,
                                                                          0]
            props_duration = props_duration.unsqueeze(-1)
            # (start, end, duration) -> 256-d positional feature, channel-first.
            position_feat = torch.cat((position_info[i], props_duration),
                                      dim=-1).float()
            position_feats.append(
                self.position_transform(position_feat).permute(0, 2, 1))

        props_features = self.prop_fc(props_features)

        inputs = props_features.permute(0, 2, 1)
        outputs = self.backbone_net(inputs, query_features, position_feats)
        outputs = self.fpn(outputs)

        # In the second training stage only the IoU branch learns, so stop
        # gradients from flowing back into backbone/FPN.
        if self.is_second_stage:
            outputs = [_.detach() for _ in outputs]
        box_lists, loss_dict = self.fcos(outputs, gt_bbox.float())

        return box_lists, loss_dict
class FPN(nn.Module):
    """1D feature pyramid used by DRN.

    Each backbone level is projected to ``out_channels`` by a 1x1 conv
    block, merged top-down with the 2x-upsampled coarser level, and then
    smoothed by a 3x3 conv block.
    """

    def __init__(self, in_channels_list: List, out_channels: int) -> None:
        super(FPN, self).__init__()

        # Build the (lateral 1x1, output 3x3) pair level by level so that
        # submodule creation order matches one pair per level.
        paired = [(conv_block(channels, out_channels, 1, 1),
                   conv_block(out_channels, out_channels, 3, 1))
                  for channels in in_channels_list]
        self.inner_blocks = nn.ModuleList([pair[0] for pair in paired])
        self.layer_blocks = nn.ModuleList([pair[1] for pair in paired])

    def forward(self, x: Tensor) -> Tuple[Tensor]:
        """Merge features top-down; return one refined map per level."""
        # Seed the pathway with the coarsest (last) level.
        merged = self.inner_blocks[-1](x[-1])
        outputs = [self.layer_blocks[-1](merged)]

        # Walk the remaining levels from coarse to fine.
        for level in range(len(x) - 2, -1, -1):
            lateral_conv = self.inner_blocks[level]
            if not lateral_conv:
                continue
            upsampled = F.interpolate(
                merged, scale_factor=2, mode='nearest')
            merged = lateral_conv(x[level]) + upsampled
            outputs.insert(0, self.layer_blocks[level](merged))

        return tuple(outputs)
+from .backbone import Backbone +from .fcos import FCOSModule +from .FPN import FPN +from .language_module import QueryEncoder + +__all__ = ['Backbone', 'FPN', 'QueryEncoder', 'FCOSModule'] diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/FPN.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/FPN.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12d9b5f742aaef11b3136351bf0e7d250d33c97f Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/FPN.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dc695f7ce987ffe2c0f732a7ff860caf657417a Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06065a676065a7c81cdf2f52f9f06262ae213118 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/backbone.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf99ca1f2cf44f20ffdcc053d0a2d8b535e614c1 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/fcos.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bf37db5b949e7ce256dcc9b22356599d165b5917 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/inference.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/language_module.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/language_module.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f5604ca50c736a5a33479f73a62becbe1201cf2 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/language_module.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-312.pyc b/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89e7e2b822862d76acb1c96ee4905786c6ff5bb2 Binary files /dev/null and b/mmaction/models/localizers/drn/drn_utils/__pycache__/loss.cpython-312.pyc differ diff --git a/mmaction/models/localizers/drn/drn_utils/backbone.py b/mmaction/models/localizers/drn/drn_utils/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..5745e14d58b0cde5f5a2d4ec0bd01bb71d4a99e7 --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/backbone.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def conv_block(in_channels: int,
               out_channels: int,
               kernel_size: int = 3,
               stride: int = 1) -> nn.Module:
    """Return a ``Conv1d -> BatchNorm1d -> ReLU`` stack.

    The convolution is 'same'-padded for odd kernel sizes and bias-free,
    since the following BatchNorm makes a conv bias redundant.
    """
    conv = nn.Conv1d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=(kernel_size - 1) // 2,
        bias=False)
    return nn.Sequential(conv, nn.BatchNorm1d(out_channels), nn.ReLU())


class Backbone(nn.Module):
    """Stacked 1D conv backbone of DRN.

    Every level first gates the clip features with its query feature; the
    first level additionally concatenates positional features along the
    channel axis before its convolution.

    Args:
        channels_list (List[tuple]): one ``(in, out, kernel, stride)``
            config per conv level.
    """

    def __init__(self, channels_list: List[tuple]) -> None:
        super(Backbone, self).__init__()
        self.num_layers = len(channels_list)
        self.layers = nn.ModuleList(
            [conv_block(*config) for config in channels_list])

    def forward(self, x: Tensor, query_fts: Tensor,
                position_fts: Tensor) -> Tuple[Tensor]:
        """Run all levels; return the per-level feature maps as a tuple."""
        outputs = []
        for idx, layer in enumerate(self.layers):
            # (B, C) query feature -> (B, C, 1), broadcast over time.
            gate = query_fts[idx].unsqueeze(1).permute(0, 2, 1)
            position_ft = position_fts[idx]
            x = gate * x
            if idx == 0:
                # Positional features only join the first level.
                x = torch.cat([x, position_ft], dim=1)
            x = layer(x)
            outputs.append(x)
        return tuple(outputs)
class Scale(nn.Module):
    """Multiply the input by a single learnable scalar."""

    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        # One-element parameter, broadcast over the whole input.
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, x):
        return x * self.scale


class FCOSHead(torch.nn.Module):
    """1D FCOS head shared across pyramid levels.

    Produces, per level, classification logits, (left, right) box
    regression and an IoU score.

    Args:
        in_channels (int): channels of the input feature maps.
        fcos_num_class (int): number of classes including background; the
            head predicts ``fcos_num_class - 1`` foreground logits.
        fcos_conv_layers (int): number of conv-BN-ReLU layers per tower.
        fcos_prior_prob (float): prior positive probability used for the
            focal-loss bias initialization of the classification logits.
        is_second_stage (bool): if True, detach regression and mixed
            features so only the IoU branch receives gradients.
    """

    def __init__(self, in_channels: int, fcos_num_class: int,
                 fcos_conv_layers: int, fcos_prior_prob: float,
                 is_second_stage: bool) -> None:
        super(FCOSHead, self).__init__()
        num_classes = fcos_num_class - 1

        # Two parallel conv towers: one for classification, one for boxes.
        cls_tower = []
        bbox_tower = []
        for i in range(fcos_conv_layers):
            cls_tower.append(
                nn.Conv1d(
                    in_channels,
                    in_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1))
            cls_tower.append(nn.BatchNorm1d(in_channels))
            cls_tower.append(nn.ReLU())
            bbox_tower.append(
                nn.Conv1d(
                    in_channels,
                    in_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1))
            bbox_tower.append(nn.BatchNorm1d(in_channels))
            bbox_tower.append(nn.ReLU())

        self.cls_tower = nn.Sequential(*cls_tower)
        self.bbox_tower = nn.Sequential(*bbox_tower)
        self.cls_logits = nn.Conv1d(
            in_channels, num_classes, kernel_size=3, stride=1, padding=1)

        # Predicts (left, right) distances to the segment boundaries.
        self.bbox_pred = nn.Conv1d(
            in_channels, 2, kernel_size=3, stride=1, padding=1)

        # Fuses both towers before the IoU branch.
        self.mix_fc = nn.Sequential(
            nn.Conv1d(2 * in_channels, in_channels, kernel_size=1, stride=1),
            nn.BatchNorm1d(in_channels), nn.ReLU())

        self.iou_scores = nn.Sequential(
            nn.Conv1d(
                in_channels,
                in_channels // 2,
                kernel_size=3,
                stride=1,
                padding=1),
            nn.BatchNorm1d(in_channels // 2),
            nn.ReLU(),
            nn.Conv1d(in_channels // 2, 1, kernel_size=1, stride=1),
        )

        # initialization
        for module in self.modules():
            if isinstance(module, nn.Conv1d):
                torch.nn.init.normal_(module.weight, std=0.01)
                torch.nn.init.constant_(module.bias, 0)

        # initialize the bias for focal loss so that, at init, positives are
        # predicted with probability fcos_prior_prob
        bias_value = -math.log((1 - fcos_prior_prob) / fcos_prior_prob)
        torch.nn.init.constant_(self.cls_logits.bias, bias_value)

        # One learnable regression scale per level.
        # NOTE(review): hard-coded to 3 levels, matching DRN's 3-level FPN.
        self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(3)])
        self.is_second_stage = is_second_stage

    def forward(self, x):
        """Apply the head to every level in ``x``.

        Returns:
            tuple of three lists (one entry per level): classification
            logits, box regressions, IoU scores.
        """
        logits = []
        bbox_reg = []
        iou_scores = []
        for idx, feature in enumerate(x):
            cls_tower = self.cls_tower(feature)
            box_tower = self.bbox_tower(feature)
            logits.append(self.cls_logits(cls_tower))

            # exp keeps the predicted distances positive.
            bbox_reg_ = torch.exp(self.scales[idx](self.bbox_pred(box_tower)))
            if self.is_second_stage:
                bbox_reg_ = bbox_reg_.detach()
            bbox_reg.append(bbox_reg_)

            mix_feature = torch.cat([cls_tower, box_tower], dim=1)
            if self.is_second_stage:
                mix_feature = mix_feature.detach()
            mix_feature = self.mix_fc(mix_feature)
            iou_scores.append(self.iou_scores(mix_feature))
        return logits, bbox_reg, iou_scores


class FCOSModule(torch.nn.Module):
    """FCOS head plus its loss evaluator (train) and post-processor (test).

    Args mirror :class:`FCOSHead` plus the post-processing thresholds; see
    the DRN class docstring for their meaning.
    """

    def __init__(self, in_channels: int, fcos_num_class: int,
                 fcos_conv_layers: int, fcos_prior_prob: float,
                 fcos_inference_thr: float, fcos_pre_nms_top_n: int,
                 fcos_nms_thr: float, test_detections_per_img: int,
                 fpn_stride: int, focal_alpha: float, focal_gamma: float,
                 is_first_stage: bool, is_second_stage: bool) -> None:
        super(FCOSModule, self).__init__()

        head = FCOSHead(
            in_channels=in_channels,
            fcos_num_class=fcos_num_class,
            fcos_conv_layers=fcos_conv_layers,
            fcos_prior_prob=fcos_prior_prob,
            is_second_stage=is_second_stage)

        self.is_first_stage = is_first_stage
        self.is_second_stage = is_second_stage
        box_selector_test = make_fcos_postprocessor(fcos_num_class,
                                                    fcos_inference_thr,
                                                    fcos_pre_nms_top_n,
                                                    fcos_nms_thr,
                                                    test_detections_per_img,
                                                    is_first_stage)
        loss_evaluator = make_fcos_loss_evaluator(focal_alpha, focal_gamma)
        self.head = head
        self.box_selector_test = box_selector_test
        self.loss_evaluator = loss_evaluator
        # NOTE(review): annotated ``int`` but indexed per level in
        # compute_locations; actually a sequence of per-level strides.
        self.fpn_strides = fpn_stride

    def forward(self, features, targets=None):
        """Dispatch to train/test paths based on ``self.training``."""
        box_cls, box_regression, iou_scores = self.head(features)
        locations = self.compute_locations(features)

        if self.training:
            return self._forward_train(locations, box_cls, box_regression,
                                       targets, iou_scores)
        else:
            return self._forward_test(locations, box_cls, box_regression,
                                      targets, iou_scores)

    def _forward_train(self, locations, box_cls, box_regression, targets,
                       iou_scores):
        """Return (None, loss dict); stage flags gate which losses train."""
        loss_box_cls, loss_box_reg, loss_iou = self.loss_evaluator(
            locations, box_cls, box_regression, targets, iou_scores,
            self.is_first_stage)

        # Stage 2 trains only the IoU branch; stage 1 trains only cls/reg.
        if self.is_second_stage:
            loss_box_cls = loss_box_cls.detach()
            loss_box_reg = loss_box_reg.detach()
        if self.is_first_stage:
            loss_iou = loss_iou.detach()

        losses = {
            'loss_cls': loss_box_cls,
            'loss_reg': loss_box_reg,
            'loss_iou': loss_iou
        }
        return None, losses

    def _forward_test(self, locations, box_cls, box_regression, targets,
                      iou_scores):
        """Return (selected boxes, None)."""
        boxes = self.box_selector_test(locations, box_cls, box_regression,
                                       iou_scores)
        losses = None
        return boxes, losses

    def compute_locations(self, features):
        """Per level, compute the temporal center location of each cell."""
        locations = []
        for level, feature in enumerate(features):
            t = feature.size(-1)
            locations_per_level = self.compute_locations_per_level(
                t, self.fpn_strides[level], feature.device)
            locations.append(locations_per_level)
        return locations

    def compute_locations_per_level(self, t, stride, device):
        """Centers of ``t`` cells at the given stride: stride*i + stride/2."""
        shifts_t = torch.arange(
            0, t * stride, step=stride, dtype=torch.float32, device=device)
        shifts_t = shifts_t.reshape(-1)
        locations = shifts_t + stride / 2
        return locations
+"""Copied from https://github.com/Alvin-Zeng/DRN/""" + +import torch + + +class FCOSPostProcessor(torch.nn.Module): + """Performs post-processing on the outputs of the RetinaNet boxes. + + This is only used in the testing. + """ + + def __init__(self, pre_nms_thresh, pre_nms_top_n, nms_thresh, + fpn_post_nms_top_n, min_size, num_classes, is_first_stage): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(FCOSPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + self.innerness_threshold = 0.15 + self.downsample_scale = 32 + self.is_first_stage = is_first_stage + + def forward_for_single_feature_map(self, locations, box_cls, + box_regression, level, iou_scores): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + N, C, T = box_cls.shape + + # put in the same format as locations + box_cls = box_cls.permute(0, 2, 1).contiguous().sigmoid() + iou_scores = iou_scores.permute(0, 2, 1).contiguous().sigmoid() + box_regression = box_regression.permute(0, 2, 1) + + # centerness = centerness.permute(0, 2, 1) + # centerness = centerness.reshape(N, -1).sigmoid() + # inner = inner.squeeze().sigmoid() + + candidate_inds = (box_cls > self.pre_nms_thresh) + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + # multiply the classification scores with centerness scores + # box_cls = box_cls * centerness[:, :, None] + # box_cls = box_cls + centerness[:, :, None] + if not self.is_first_stage: + box_cls = box_cls * iou_scores + + results = [] + for i in range(N): + + # per_centerness = centerness[i] + + 
per_box_cls = box_cls[i] + per_candidate_inds = candidate_inds[i] + per_box_cls = per_box_cls[per_candidate_inds] + + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + 1 + + per_box_regression = box_regression[i] + per_box_regression = per_box_regression[per_box_loc] + per_locations = locations[per_box_loc] + + # per_centerness = per_centerness[per_box_loc] + + per_pre_nms_top_n = pre_nms_top_n[i] + + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_class = per_class[top_k_indices] + per_box_regression = per_box_regression[top_k_indices] + per_locations = per_locations[top_k_indices] + + # per_centerness = per_centerness[top_k_indices] + + detections = torch.stack([ + per_locations - per_box_regression[:, 0], + per_locations + per_box_regression[:, 1], + ], + dim=1) / self.downsample_scale + + detections[:, 0].clamp_(min=0, max=1) + detections[:, 1].clamp_(min=0, max=1) + + # remove small boxes + p_start, p_end = detections.unbind(dim=1) + duration = p_end - p_start + keep = (duration >= self.min_size).nonzero().squeeze(1) + detections = detections[keep] + + temp_dict = {} + temp_dict['detections'] = detections + temp_dict['labels'] = per_class + temp_dict['scores'] = torch.sqrt(per_box_cls) + temp_dict['level'] = [level] + # temp_dict['centerness'] = per_centerness + temp_dict['locations'] = per_locations / 32 + + results.append(temp_dict) + + return results + + def forward(self, locations, box_cls, box_regression, iou_scores): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + box_regression: list[tensor] + image_sizes: list[(h, w)] + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + for i, (l, o, b, iou_s) in enumerate( + zip(locations, box_cls, box_regression, 
    def select_over_all_levels(self, boxlists):
        """Merge per-FPN-level detections into one result dict per video.

        Args:
            boxlists: per-video iterables of per-level dicts, each with keys
                'detections', 'scores', 'level' and 'locations'.

        Returns:
            list[dict]: one dict per video with the per-level entries
            concatenated; when a video produced no detection at any level a
            placeholder segment ``[0, 1]`` with score 1 and level ``[-1]``
            is emitted instead so downstream code always has one entry.
        """
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            dicts = boxlists[i]
            per_vid_scores = []
            per_vid_detections = []
            # NOTE(review): per_vid_labels is never appended to below, so
            # results[i]['labels'] is always an empty list — confirm callers
            # do not rely on it.
            per_vid_labels = []
            # add level number
            per_vid_level = []
            per_vid_locations = []
            # per_vid_centerness = []
            for per_scale_dict in dicts:
                if len(per_scale_dict['detections']) != 0:
                    per_vid_detections.append(per_scale_dict['detections'])
                if len(per_scale_dict['scores']) != 0:
                    per_vid_scores.append(per_scale_dict['scores'])
                if len(per_scale_dict['level']) != 0:
                    # replicate the level id once per detection at this level
                    per_vid_level.append(per_scale_dict['level'] *
                                         len(per_scale_dict['detections']))

                if len(per_scale_dict['locations']) != 0:
                    per_vid_locations.append(per_scale_dict['locations'])

                # if len(per_scale_dict['centerness']) != 0:
                #     per_vid_centerness.append(per_scale_dict['centerness'])
            if len(per_vid_detections) == 0:
                # no detection survived at any level: emit a dummy proposal
                # covering [0, 1] so the result structure stays non-empty
                per_vid_detections = torch.Tensor([0, 1]).unsqueeze(0)
                per_vid_scores = torch.Tensor([1])
                per_vid_level = [[-1]]
                per_vid_locations = torch.Tensor([0.5])
                # per_vid_centerness = torch.Tensor([0.5]).cuda()
            else:
                per_vid_detections = torch.cat(per_vid_detections, dim=0)
                per_vid_scores = torch.cat(per_vid_scores, dim=0)
                # levels stay a python list of lists (not concatenated)
                per_vid_level = per_vid_level
                per_vid_locations = torch.cat(per_vid_locations, dim=0)
                # per_vid_centerness = torch.cat(per_vid_centerness, dim=0)

            temp_dict = {}
            temp_dict['detections'] = per_vid_detections
            temp_dict['labels'] = per_vid_labels
            temp_dict['scores'] = per_vid_scores
            temp_dict['level'] = per_vid_level
            # temp_dict['centerness'] = per_vid_centerness
            temp_dict['locations'] = per_vid_locations
            results.append(temp_dict)

        return results
lstm_outputs: Tensor, + q_length: Tensor, t: int): + q_cmd = self.W3(q_encoding).relu() + q_cmd = self.W2[t](q_cmd) + q_cmd = q_cmd[:, None, :] * lstm_outputs + raw_att = self.W1(q_cmd).squeeze(-1) + + raw_att = apply_mask1d(raw_att, q_length) + att = raw_att.softmax(dim=-1) + cmd = torch.bmm(att[:, None, :], lstm_outputs).squeeze(1) + return cmd + + def forward(self, query_tokens: Tensor, + query_length: Tensor) -> List[Tensor]: + self.biLSTM.flatten_parameters() + + query_embedding = self.embedding(query_tokens) + + # output denotes the forward and backward hidden states in Eq 2. + query_embedding = pack_padded_sequence( + query_embedding, query_length.cpu(), batch_first=True) + output, _ = self.biLSTM(query_embedding) + output, _ = pad_packed_sequence(output, batch_first=True) + + # q_vector denotes the global representation `g` in Eq 2. + q_vector_list = [] + + for i, length in enumerate(query_length): + h1 = output[i][0] + hs = output[i][length - 1] + q_vector = torch.cat((h1, hs), dim=-1) + q_vector_list.append(q_vector) + q_vector = torch.stack(q_vector_list) + # outputs denotes the query feature in Eq3 in 3 levels. + outputs = [] + for cmd_t in range(3): + query_feat = self.extract_textual(q_vector, output, query_length, + cmd_t) + outputs.append(query_feat) + + # Note: the output here is zero-padded + # we need slice the non-zero items for the following operations. 
+ return outputs + + +def apply_mask1d(attention: Tensor, image_locs: Tensor) -> Tensor: + batch_size, num_loc = attention.size() + tmp1 = torch.arange( + num_loc, dtype=attention.dtype, device=attention.device) + tmp1 = tmp1.expand(batch_size, num_loc) + + tmp2 = image_locs.unsqueeze(dim=1).expand(batch_size, num_loc) + mask = tmp1 >= tmp2.to(tmp1.dtype) + attention = attention.masked_fill(mask, -1e30) + return attention diff --git a/mmaction/models/localizers/drn/drn_utils/loss.py b/mmaction/models/localizers/drn/drn_utils/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..28631c99b5dfbd4fce0756e7ac8834844644c70b --- /dev/null +++ b/mmaction/models/localizers/drn/drn_utils/loss.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Adapted from https://github.com/Alvin-Zeng/DRN/""" + +import torch +import torchvision +from torch import nn + +INF = 100000000 + + +def SigmoidFocalLoss(alpha, gamma): + + def loss_fn(inputs, targets): + loss = torchvision.ops.sigmoid_focal_loss( + inputs=inputs, + targets=targets, + alpha=alpha, + gamma=gamma, + reduction='sum') + return loss + + return loss_fn + + +def IOULoss(): + + def loss_fn(pred, target): + pred_left = pred[:, 0] + pred_right = pred[:, 1] + + target_left = target[:, 0] + target_right = target[:, 1] + + intersect = torch.min(pred_right, target_right) + torch.min( + pred_left, target_left) + target_area = target_left + target_right + pred_area = pred_left + pred_right + union = target_area + pred_area - intersect + + losses = -torch.log((intersect + 1e-8) / (union + 1e-8)) + return losses.mean() + + return loss_fn + + +class FCOSLossComputation(object): + """This class computes the FCOS losses.""" + + def __init__(self, focal_alpha, focal_gamma): + self.cls_loss_fn = SigmoidFocalLoss(focal_alpha, focal_gamma) + self.box_reg_loss_fn = IOULoss() + self.centerness_loss_fn = nn.BCEWithLogitsLoss() + self.iou_loss_fn = nn.SmoothL1Loss() + + def prepare_targets(self, points, 
    def compute_targets_for_locations(self, locations, targets,
                                      object_sizes_of_interest):
        """Assign a 1D regression target and a fg/bg label to each location.

        Args:
            locations: temporal positions of all FPN levels, shape (T,).
            targets: per-video GT segments, normalized; scaled by 32 below
                to match the location coordinate system.
            object_sizes_of_interest: per-location (min, max) size range,
                shape (T, 2); locations whose max regression distance falls
                outside their level's range are marked background.

        Returns:
            tuple(list[Tensor], list[Tensor]): per-video labels (1 = fg,
            0 = bg) and (left, right) regression targets of shape (T, 2).
        """
        labels = []
        reg_targets = []
        ts = locations

        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            # GT is normalized to [0, 1]; scale to the 32-unit timeline used
            # by the locations.
            bboxes = targets_per_im * 32

            # distances from each location to the segment start / end
            left = ts[:, None] - bboxes[None, 0]
            right = bboxes[None, 1] - ts[:, None]
            reg_targets_per_im = torch.cat([left, right], dim=1)

            # a location is inside the segment iff both distances are > 0
            is_in_boxes = reg_targets_per_im.min(dim=1)[0] > 0
            max_reg_targets_per_im = reg_targets_per_im.max(dim=1)[0]
            # FPN level gating: only keep locations whose largest regression
            # distance falls in this level's size-of-interest window
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= object_sizes_of_interest[:, 0]) & \
                (max_reg_targets_per_im <= object_sizes_of_interest[:, 1])

            # segment length, used as the "area" to pick the smallest match
            locations_to_gt_area = bboxes[1] - bboxes[0]
            locations_to_gt_area = locations_to_gt_area.repeat(
                len(locations), 1)
            # disqualified locations get INF area so they lose the min below
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            _ = locations_to_gt_area.min(dim=1)
            locations_to_min_area, locations_to_gt_inds = _

            labels_per_im = reg_targets_per_im.new_ones(
                len(reg_targets_per_im))
            # INF area means no valid GT matched -> background
            labels_per_im[locations_to_min_area == INF] = 0

            labels.append(labels_per_im)
            reg_targets.append(reg_targets_per_im)

        return labels, reg_targets
start < end) + predictions.clamp_(min=0, max=1) + # gt: [batch, 2] + gt_box = targets[:, None, :] + + iou_target = segment_tiou(predictions, gt_box) + iou_pred = torch.cat(iou_scores, dim=-1).squeeze().sigmoid() + iou_pos_ind = iou_target > 0.9 + pos_iou_target = iou_target[iou_pos_ind] + + pos_iou_pred = iou_pred[iou_pos_ind] + + if iou_pos_ind.sum().item() == 0: + iou_loss = torch.tensor([0.]).to(iou_pos_ind.device) + else: + iou_loss = self.iou_loss_fn(pos_iou_pred, pos_iou_target) + + box_cls_flatten = torch.cat(box_cls_flatten, dim=0) + box_regression_flatten = torch.cat(box_regression_flatten, dim=0) + labels_flatten = torch.cat(labels_flatten, dim=0) + reg_targets_flatten = torch.cat(reg_targets_flatten, dim=0) + + pos_inds = torch.nonzero(labels_flatten > 0).squeeze(1) + cls_loss = self.cls_loss_fn( + box_cls_flatten, labels_flatten.unsqueeze(1)) / ( + pos_inds.numel() + N) # add N to avoid dividing by a zero + + box_regression_flatten = box_regression_flatten[pos_inds] + reg_targets_flatten = reg_targets_flatten[pos_inds] + + if pos_inds.numel() > 0: + reg_loss = self.box_reg_loss_fn( + box_regression_flatten, + reg_targets_flatten, + ) + else: + reg_loss = box_regression_flatten.sum() + + if not is_first_stage: + return cls_loss, reg_loss, iou_loss + + return cls_loss, reg_loss, torch.tensor([0.]).to(cls_loss.device) + + +def segment_tiou(box_a, box_b): + + # gt: [batch, 1, 2], detections: [batch, 56, 2] + # calculate interaction + inter_max_xy = torch.min(box_a[:, :, -1], box_b[:, :, -1]) + inter_min_xy = torch.max(box_a[:, :, 0], box_b[:, :, 0]) + inter = torch.clamp((inter_max_xy - inter_min_xy), min=0) + + # calculate union + union_max_xy = torch.max(box_a[:, :, -1], box_b[:, :, -1]) + union_min_xy = torch.min(box_a[:, :, 0], box_b[:, :, 0]) + union = torch.clamp((union_max_xy - union_min_xy), min=0) + + iou = inter / (union + 1e-6) + + return iou + + +def make_fcos_loss_evaluator(focal_alpha, focal_gamma): + loss_evaluator = 
class LGTE(BaseModel):
    """Local-Global Temporal Encoder (LGTE)

    Args:
        input_dim (int): Input feature dimension.
        dropout (float): the dropout rate for the residual branch of
            self-attention and ffn.
        temporal_dim (int): Total frames selected for each video.
            Defaults to 100.
        window_size (int): the window size for Local Temporal Encoder.
            Defaults to 9.
        num_heads (int): Number of attention heads; the first half are
            window-masked (local), the rest attend globally. Defaults to 8.
        init_cfg (dict or ConfigDict, optional): The Config for
            initialization. Defaults to None.
    """

    def __init__(self,
                 input_dim: int,
                 dropout: float,
                 temporal_dim: int = 100,
                 window_size: int = 9,
                 num_heads: int = 8,
                 init_cfg: OptConfigType = None,
                 **kwargs) -> None:
        super(LGTE, self).__init__(init_cfg)

        self.atten = MultiheadAttention(
            embed_dims=input_dim,
            num_heads=num_heads,
            proj_drop=dropout,
            attn_drop=0.1)
        self.ffn = FFN(
            embed_dims=input_dim, feedforward_channels=256, ffn_drop=dropout)

        norm_cfg = dict(type='LN', eps=1e-6)
        self.norm1 = build_norm_layer(norm_cfg, input_dim)[1]
        self.norm2 = build_norm_layer(norm_cfg, input_dim)[1]

        # Precomputed boolean attention mask of shape
        # (1, num_heads, temporal_dim, temporal_dim); registered as a buffer
        # so it follows the module across devices without being a parameter.
        mask = self._mask_matrix(num_heads, temporal_dim, window_size)
        self.register_buffer('mask', mask)

    def forward(self, x: Tensor) -> Tensor:
        """Forward call for LGTE.

        Args:
            x (torch.Tensor): The input tensor with shape (B, C, L)
        """
        # (B, C, L) -> (L, B, C): sequence-first layout for attention
        x = x.permute(2, 0, 1)
        # tile the (1, heads, L, L) mask over the batch, then flatten to
        # (B * heads, L, L) as expected by the attention's attn_mask
        mask = self.mask.repeat(x.size(1), 1, 1, 1)
        L = x.shape[0]
        x = self.atten(x, attn_mask=mask.reshape(-1, L, L))
        x = self.norm1(x)
        x = self.ffn(x)
        x = self.norm2(x)
        # back to (B, C, L)
        x = x.permute(1, 2, 0)
        return x

    @staticmethod
    def _mask_matrix(num_heads: int, temporal_dim: int,
                     window_size: int) -> Tensor:
        """Build the local/global head mask (True = position is masked).

        Only the first ``num_heads // 2`` heads are filled: for them,
        positions farther than ``window_size / 2`` from the query index are
        masked (local attention). The remaining heads keep an all-False
        mask and therefore attend globally.
        """
        mask = torch.zeros(num_heads, temporal_dim, temporal_dim)
        index = torch.arange(temporal_dim)

        for i in range(num_heads // 2):
            for j in range(temporal_dim):
                ignored = (index - j).abs() > window_size / 2
                mask[i, j] = ignored

        return mask.unsqueeze(0).bool()
+ """ + hidden_dim = 128 + regressor = nn.Sequential( + nn.Conv1d( + feat_dim * 2, + hidden_dim * 2, + kernel_size=3, + padding=1, + groups=8, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim * 2, + hidden_dim * 2, + kernel_size=3, + padding=1, + groups=8, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d(hidden_dim * 2, 2, kernel_size=sample_num // 4, groups=2), + nn.Flatten()) + return regressor + + +def CenterWidthRegressor(temporal_len: int, feat_dim: int) -> nn.Module: + """Center Width in the Temporal Boundary Regressor. + + Args: + temporal_len (int): temporal dimension of the inputs. + feat_dim (int): feature dimension. + + Returns: + A pytorch module that works as the start and end regressor. The input + of the module should have a shape of (B, feat_dim, temporal_len). + """ + hidden_dim = 512 + regressor = nn.Sequential( + nn.Conv1d( + feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim, + hidden_dim, + kernel_size=3, + padding=1, + groups=4, + stride=2), nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim, hidden_dim, kernel_size=temporal_len // 4, groups=4), + nn.ReLU(inplace=True), nn.Conv1d(hidden_dim, 3, kernel_size=1)) + return regressor + + +class TemporalTransform: + """Temporal Transform to sample temporal features.""" + + def __init__(self, prop_boundary_ratio: float, action_sample_num: int, + se_sample_num: int, temporal_interval: int): + super(TemporalTransform, self).__init__() + self.temporal_interval = temporal_interval + self.prop_boundary_ratio = prop_boundary_ratio + self.action_sample_num = action_sample_num + self.se_sample_num = se_sample_num + + def __call__(self, segments: Tensor, features: Tensor) -> List[Tensor]: + s_len = segments[:, 1] - segments[:, 0] + starts_segments = [ + segments[:, 0] - self.prop_boundary_ratio * s_len, segments[:, 0] + ] + starts_segments = torch.stack(starts_segments, dim=1) + + ends_segments = [ + segments[:, 1], segments[:, 1] 
+ self.prop_boundary_ratio * s_len + ] + ends_segments = torch.stack(ends_segments, dim=1) + + starts_feature = self._sample_one_temporal(starts_segments, + self.se_sample_num, + features) + ends_feature = self._sample_one_temporal(ends_segments, + self.se_sample_num, features) + actions_feature = self._sample_one_temporal(segments, + self.action_sample_num, + features) + return starts_feature, actions_feature, ends_feature + + def _sample_one_temporal(self, segments: Tensor, out_len: int, + features: Tensor) -> Tensor: + segments = segments.clamp(0, 1) * 2 - 1 + theta = segments.new_zeros((features.size(0), 2, 3)) + theta[:, 1, 1] = 1.0 + theta[:, 0, 0] = (segments[:, 1] - segments[:, 0]) / 2.0 + theta[:, 0, 2] = (segments[:, 1] + segments[:, 0]) / 2.0 + + size = torch.Size((*features.shape[:2], 1, out_len)) + grid = F.affine_grid(theta, size) + stn_feature = F.grid_sample(features.unsqueeze(2), grid) + stn_feature = stn_feature.view(*features.shape[:2], out_len) + return stn_feature + + +class TBR(BaseModel): + """Temporal Boundary Regressor (TBR)""" + + def __init__(self, + se_sample_num: int, + action_sample_num: int, + temporal_dim: int, + prop_boundary_ratio: float = 0.5, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(TBR, self).__init__(init_cfg) + + hidden_dim = 512 + + self.reg1se = StartEndRegressor(se_sample_num, hidden_dim) + temporal_len = se_sample_num * 2 + action_sample_num + self.reg1xw = CenterWidthRegressor(temporal_len, hidden_dim) + self.ttn = TemporalTransform(prop_boundary_ratio, action_sample_num, + se_sample_num, temporal_dim) + + def forward(self, proposals: Tensor, features: Tensor, gt_boxes: Tensor, + iou_thres: float, training: bool) -> tuple: + proposals1 = proposals[:, :2] + starts_feat1, actions_feat1, ends_feat1 = self.ttn( + proposals1, features) + + reg1se = self.reg1se(torch.cat([starts_feat1, ends_feat1], dim=1)) + + features1xw = torch.cat([starts_feat1, actions_feat1, ends_feat1], + dim=2) + reg1xw = 
self.reg1xw(features1xw).squeeze(2) + + preds_iou1 = reg1xw[:, 2].sigmoid() + reg1xw = reg1xw[:, :2] + + if training: + proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2) + proposals2se = bbox_se_transform_inv(proposals1, reg1se, 1.0) + + iou1 = batch_iou(proposals1, gt_boxes) + targets1se = bbox_se_transform_batch(proposals1, gt_boxes) + targets1xw = bbox_xw_transform_batch(proposals1, gt_boxes) + rloss1se = self.regress_loss(reg1se, targets1se, iou1, iou_thres) + rloss1xw = self.regress_loss(reg1xw, targets1xw, iou1, iou_thres) + rloss1 = rloss1se + rloss1xw + iloss1 = self.iou_loss(preds_iou1, iou1, iou_thres=iou_thres) + else: + proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2) + proposals2se = bbox_se_transform_inv(proposals1, reg1se, 0.2) + rloss1 = iloss1 = 0 + proposals2 = (proposals2se + proposals2xw) / 2.0 + proposals2 = torch.clamp(proposals2, min=0.) + return preds_iou1, proposals2, rloss1, iloss1 + + def regress_loss(self, regression, targets, iou_with_gt, iou_thres): + weight = (iou_with_gt >= iou_thres).float().unsqueeze(1) + reg_loss = F.smooth_l1_loss(regression, targets, reduction='none') + if weight.sum() > 0: + reg_loss = (weight * reg_loss).sum() / weight.sum() + else: + reg_loss = (weight * reg_loss).sum() + return reg_loss + + def iou_loss(self, preds_iou, match_iou, iou_thres): + preds_iou = preds_iou.view(-1) + u_hmask = (match_iou > iou_thres).float() + u_mmask = ((match_iou <= iou_thres) & (match_iou > 0.3)).float() + u_lmask = (match_iou <= 0.3).float() + + num_h, num_m, num_l = u_hmask.sum(), u_mmask.sum(), u_lmask.sum() + + bs, device = u_hmask.size()[0], u_hmask.device + + r_m = min(num_h / num_m, 1) + u_smmask = torch.rand(bs, device=device) * u_mmask + u_smmask = (u_smmask > (1. - r_m)).float() + + r_l = min(num_h / num_l, 1) + u_slmask = torch.rand(bs, device=device) * u_lmask + u_slmask = (u_slmask > (1. 
- r_l)).float() + + iou_weights = u_hmask + u_smmask + u_slmask + iou_loss = F.smooth_l1_loss(preds_iou, match_iou, reduction='none') + if iou_weights.sum() > 0: + iou_loss = (iou_loss * iou_weights).sum() / iou_weights.sum() + else: + iou_loss = (iou_loss * iou_weights).sum() + return iou_loss + + +@MODELS.register_module() +class TCANet(BaseModel): + """Temporal Context Aggregation Network. + + Please refer `Temporal Context Aggregation Network for Temporal Action + Proposal Refinement `_. + Code Reference: + https://github.com/qinzhi-0110/Temporal-Context-Aggregation-Network-Pytorch + """ + + def __init__(self, + feat_dim: int = 2304, + se_sample_num: int = 32, + action_sample_num: int = 64, + temporal_dim: int = 100, + window_size: int = 9, + lgte_num: int = 2, + soft_nms_alpha: float = 0.4, + soft_nms_low_threshold: float = 0.0, + soft_nms_high_threshold: float = 0.0, + post_process_top_k: int = 100, + feature_extraction_interval: int = 16, + init_cfg: OptConfigType = None, + **kwargs) -> None: + super(TCANet, self).__init__(init_cfg) + + self.soft_nms_alpha = soft_nms_alpha + self.soft_nms_low_threshold = soft_nms_low_threshold + self.soft_nms_high_threshold = soft_nms_high_threshold + self.feature_extraction_interval = feature_extraction_interval + self.post_process_top_k = post_process_top_k + + hidden_dim = 512 + self.x_1d_b_f = nn.Sequential( + nn.Conv1d( + feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4), + nn.ReLU(inplace=True), + nn.Conv1d( + hidden_dim, hidden_dim, kernel_size=3, padding=1, groups=4), + nn.ReLU(inplace=True), + ) + + for i in 1, 2, 3: + tbr = TBR( + se_sample_num=se_sample_num, + action_sample_num=action_sample_num, + temporal_dim=temporal_dim, + init_cfg=init_cfg, + **kwargs) + setattr(self, f'tbr{i}', tbr) + + self.lgtes = nn.ModuleList([ + LGTE( + input_dim=hidden_dim, + dropout=0.1, + temporal_dim=temporal_dim, + window_size=window_size, + init_cfg=init_cfg, + **kwargs) for i in range(lgte_num) + ]) + + def forward(self, 
inputs, data_samples, mode, **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[:obj:`ActionDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + if not isinstance(input, Tensor): + inputs = torch.stack(inputs) + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + def _forward(self, x): + """Define the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + Returns: + torch.Tensor: The output of the module. 
+ """ + x = self.x_1d_b_f(x) + for layer in self.lgtes: + x = layer(x) + return x + + def loss(self, batch_inputs, batch_data_samples, **kwargs): + features = self._forward(batch_inputs) + proposals_ = [ + sample.proposals['proposals'] for sample in batch_data_samples + ] + + batch_size = len(proposals_) + proposals_num = max([_.shape[0] for _ in proposals_]) + + proposals = torch.zeros((batch_size, proposals_num, 3), + device=features.device) + for i, proposal in enumerate(proposals_): + proposals[i, :proposal.shape[0]] = proposal + + gt_boxes_ = [ + sample.gt_instances['gt_bbox'] for sample in batch_data_samples + ] + gt_boxes = torch.zeros((batch_size, proposals_num, 2), + device=features.device) + for i, gt_box in enumerate(gt_boxes_): + L = gt_box.shape[0] + if L <= proposals_num: + gt_boxes[i, :L] = gt_box + else: + random_index = torch.randperm(L)[:proposals_num] + gt_boxes[i] = gt_box[random_index] + + for i in range(batch_size): + proposals[i, :, 2] = i + proposals = proposals.view(batch_size * proposals_num, 3) + proposals_select = proposals[:, 0:2].sum(dim=1) > 0 + proposals = proposals[proposals_select, :] + + features = features[proposals[:, 2].long()] + + gt_boxes = gt_boxes.view(batch_size * proposals_num, 2) + gt_boxes = gt_boxes[proposals_select, :] + + _, proposals1, rloss1, iloss1 = self.tbr1(proposals, features, + gt_boxes, 0.5, True) + _, proposals2, rloss2, iloss2 = self.tbr2(proposals1, features, + gt_boxes, 0.6, True) + _, _, rloss3, iloss3 = self.tbr3(proposals2, features, gt_boxes, 0.7, + True) + + loss_dict = dict( + rloss1=rloss1, + rloss2=rloss2, + rloss3=rloss3, + iloss1=iloss1, + iloss2=iloss2, + iloss3=iloss3) + return loss_dict + + def predict(self, batch_inputs, batch_data_samples, **kwargs): + features = self._forward(batch_inputs) + proposals_ = [ + sample.proposals['proposals'] for sample in batch_data_samples + ] + + batch_size = len(proposals_) + proposals_num = max([_.shape[0] for _ in proposals_]) + + proposals = 
torch.zeros((batch_size, proposals_num, 3), + device=features.device) + for i, proposal in enumerate(proposals_): + proposals[i, :proposal.shape[0]] = proposal + + scores = proposals[:, :, 2] + for i in range(batch_size): + proposals[i, :, 2] = i + + proposals = proposals.view(batch_size * proposals_num, 3) + proposals_select = proposals[:, 0:2].sum(dim=1) > 0 + proposals = proposals[proposals_select, :] + scores = scores.view(-1)[proposals_select] + + features = features[proposals[:, 2].long()] + + preds_iou1, proposals1 = self.tbr1(proposals, features, None, 0.5, + False)[:2] + preds_iou2, proposals2 = self.tbr2(proposals1, features, None, 0.6, + False)[:2] + preds_iou3, proposals3 = self.tbr3(proposals2, features, None, 0.7, + False)[:2] + + all_proposals = [] + # all_proposals = [proposals] + all_proposals += [ + torch.cat([proposals1, (scores * preds_iou1).view(-1, 1)], dim=1) + ] + all_proposals += [ + torch.cat([proposals2, (scores * preds_iou2).view(-1, 1)], dim=1) + ] + all_proposals += [ + torch.cat([proposals3, (scores * preds_iou3).view(-1, 1)], dim=1) + ] + + all_proposals = torch.cat(all_proposals, dim=0).cpu().numpy() + video_info = batch_data_samples[0].metainfo + proposal_list = post_processing(all_proposals, video_info, + self.soft_nms_alpha, + self.soft_nms_low_threshold, + self.soft_nms_high_threshold, + self.post_process_top_k, + self.feature_extraction_interval) + output = [ + dict( + video_name=video_info['video_name'], + proposal_list=proposal_list) + ] + return output diff --git a/mmaction/models/localizers/utils/__init__.py b/mmaction/models/localizers/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53880641d5e611acc208ed0fa067a977b39b7d41 --- /dev/null +++ b/mmaction/models/localizers/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bsn_utils import generate_bsp_feature, generate_candidate_proposals +from .proposal_utils import (post_processing, soft_nms, temporal_iop, + temporal_iou) +from .tcanet_utils import (batch_iou, bbox_se_transform_batch, + bbox_se_transform_inv, bbox_xw_transform_batch, + bbox_xw_transform_inv) + +__all__ = [ + 'batch_iou', 'bbox_se_transform_batch', 'bbox_se_transform_inv', + 'bbox_xw_transform_batch', 'bbox_xw_transform_inv', 'generate_bsp_feature', + 'generate_candidate_proposals', 'post_processing', 'soft_nms', + 'temporal_iop', 'temporal_iou' +] diff --git a/mmaction/models/localizers/utils/__pycache__/__init__.cpython-312.pyc b/mmaction/models/localizers/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a0d887dccf93ca07a01356e1928564a14dc1fde Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-312.pyc b/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11ea2aaa4694adba26dda1131b297038883d0d85 Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/bsn_utils.cpython-312.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-312.pyc b/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af2620ccf50cbf6126bb093db75d88653590f8d6 Binary files /dev/null and b/mmaction/models/localizers/utils/__pycache__/proposal_utils.cpython-312.pyc differ diff --git a/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-312.pyc b/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70222ecdeb5f17d818b99345bd1e92415ee4de32 Binary files /dev/null and 
b/mmaction/models/localizers/utils/__pycache__/tcanet_utils.cpython-312.pyc differ diff --git a/mmaction/models/localizers/utils/bsn_utils.py b/mmaction/models/localizers/utils/bsn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..247dc8a0a850110627e8d5297a02c0cca881f492 --- /dev/null +++ b/mmaction/models/localizers/utils/bsn_utils.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import numpy as np + +from .proposal_utils import temporal_iop, temporal_iou + + +def generate_candidate_proposals(video_list, + video_infos, + tem_results_dir, + temporal_scale, + peak_threshold, + tem_results_ext='.csv', + result_dict=None): + """Generate Candidate Proposals with given temporal evaluation results. + + Each proposal file will contain: + 'tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa'. + Args: + video_list (list[int]): List of video indexes to generate proposals. + video_infos (list[dict]): List of video_info dict that contains + 'video_name', 'duration_frame', 'duration_second', + 'feature_frame', and 'annotations'. + tem_results_dir (str): Directory to load temporal evaluation + results. + temporal_scale (int): The number (scale) on temporal axis. + peak_threshold (float): The threshold for proposal generation. + tem_results_ext (str): File extension for temporal evaluation + model output. Default: '.csv'. + result_dict (dict | None): The dict to save the results. Default: None. + Returns: + dict: A dict contains video_name as keys and proposal list as value. + If result_dict is not None, save the results to it. + """ + if tem_results_ext != '.csv': + raise NotImplementedError('Only support csv format now.') + + tscale = temporal_scale + tgap = 1. 
/ tscale + proposal_dict = {} + for video_index in video_list: + video_name = video_infos[video_index]['video_name'] + tem_path = osp.join(tem_results_dir, video_name + tem_results_ext) + tem_results = np.loadtxt( + tem_path, dtype=np.float32, delimiter=',', skiprows=1) + start_scores = tem_results[:, 1] + end_scores = tem_results[:, 2] + + max_start = max(start_scores) + max_end = max(end_scores) + + start_bins = np.zeros(len(start_scores)) + start_bins[[0, -1]] = 1 + end_bins = np.zeros(len(end_scores)) + end_bins[[0, -1]] = 1 + for idx in range(1, tscale - 1): + if start_scores[idx] > start_scores[ + idx + 1] and start_scores[idx] > start_scores[idx - 1]: + start_bins[idx] = 1 + elif start_scores[idx] > (peak_threshold * max_start): + start_bins[idx] = 1 + if end_scores[idx] > end_scores[ + idx + 1] and end_scores[idx] > end_scores[idx - 1]: + end_bins[idx] = 1 + elif end_scores[idx] > (peak_threshold * max_end): + end_bins[idx] = 1 + + tmin_list = [] + tmin_score_list = [] + tmax_list = [] + tmax_score_list = [] + for idx in range(tscale): + if start_bins[idx] == 1: + tmin_list.append(tgap / 2 + tgap * idx) + tmin_score_list.append(start_scores[idx]) + if end_bins[idx] == 1: + tmax_list.append(tgap / 2 + tgap * idx) + tmax_score_list.append(end_scores[idx]) + + new_props = [] + for tmax, tmax_score in zip(tmax_list, tmax_score_list): + for tmin, tmin_score in zip(tmin_list, tmin_score_list): + if tmin >= tmax: + break + new_props.append([tmin, tmax, tmin_score, tmax_score]) + + new_props = np.stack(new_props) + + score = (new_props[:, 2] * new_props[:, 3]).reshape(-1, 1) + new_props = np.concatenate((new_props, score), axis=1) + + new_props = new_props[new_props[:, -1].argsort()[::-1]] + video_info = video_infos[video_index] + video_frame = video_info['duration_frame'] + video_second = video_info['duration_second'] + feature_frame = video_info['feature_frame'] + corrected_second = float(feature_frame) / video_frame * video_second + + gt_tmins = [] + gt_tmaxs = 
def generate_bsp_feature(video_list,
                         video_infos,
                         tem_results_dir,
                         pgm_proposals_dir,
                         top_k=1000,
                         bsp_boundary_ratio=0.2,
                         num_sample_start=8,
                         num_sample_end=8,
                         num_sample_action=16,
                         num_sample_interp=3,
                         tem_results_ext='.csv',
                         pgm_proposal_ext='.csv',
                         result_dict=None):
    """Generate Boundary-Sensitive Proposal Feature with given proposals.

    Args:
        video_list (list[int]): List of video indexes to generate bsp_feature.
        video_infos (list[dict]): List of video_info dict that contains
            'video_name'.
        tem_results_dir (str): Directory to load temporal evaluation
            results.
        pgm_proposals_dir (str): Directory to load proposals.
        top_k (int): Number of proposals to be considered. Default: 1000
        bsp_boundary_ratio (float): Ratio for proposal boundary
            (start/end). Default: 0.2.
        num_sample_start (int): Num of samples for actionness in
            start region. Default: 8.
        num_sample_end (int): Num of samples for actionness in end region.
            Default: 8.
        num_sample_action (int): Num of samples for actionness in center
            region. Default: 16.
        num_sample_interp (int): Num of samples for interpolation for
            each sample point. Default: 3.
        tem_results_ext (str): File extension for temporal evaluation
            model output. Default: '.csv'.
        pgm_proposal_ext (str): File extension for proposals. Default: '.csv'.
        result_dict (dict | None): The dict to save the results. Default: None.

    Returns:
        bsp_feature_dict (dict): A dict contains video_name as keys and
            bsp_feature as value. If result_dict is not None, save the
            results to it.
    """
    if tem_results_ext != '.csv' or pgm_proposal_ext != '.csv':
        raise NotImplementedError('Only support csv format now.')

    bsp_feature_dict = {}
    for video_index in video_list:
        video_name = video_infos[video_index]['video_name']

        # Load temporal evaluation results.
        # NOTE(review): assumes the TEM csv columns are
        # (action, start, end, tmin, tmax) — column 0 and columns 3/4 are
        # the only ones read below; confirm against the TEM dump format.
        tem_path = osp.join(tem_results_dir, video_name + tem_results_ext)
        tem_results = np.loadtxt(
            tem_path, dtype=np.float32, delimiter=',', skiprows=1)
        score_action = tem_results[:, 0]
        seg_tmins = tem_results[:, 3]
        seg_tmaxs = tem_results[:, 4]
        video_scale = len(tem_results)
        # Width of one temporal segment; assumes a uniform grid so the
        # first segment's width is representative.
        video_gap = seg_tmaxs[0] - seg_tmins[0]
        # Number of zero-padded segments added on each side of the video.
        video_extend = int(video_scale / 4 + 10)

        # Load proposals results (only the top_k highest-ranked rows).
        proposal_path = osp.join(pgm_proposals_dir,
                                 video_name + pgm_proposal_ext)
        pgm_proposals = np.loadtxt(
            proposal_path, dtype=np.float32, delimiter=',', skiprows=1)
        pgm_proposals = pgm_proposals[:top_k]

        # Generate temporal sample points: pad the actionness curve with
        # zeros on both sides and build the matching time axis so that
        # np.interp can be queried slightly outside [0, 1].
        boundary_zeros = np.zeros([video_extend])
        score_action = np.concatenate(
            (boundary_zeros, score_action, boundary_zeros))
        begin_tp = []
        middle_tp = []
        end_tp = []
        for i in range(video_extend):
            begin_tp.append(-video_gap / 2 -
                            (video_extend - 1 - i) * video_gap)
            end_tp.append(video_gap / 2 + seg_tmaxs[-1] + i * video_gap)
        for i in range(video_scale):
            middle_tp.append(video_gap / 2 + i * video_gap)
        t_points = begin_tp + middle_tp + end_tp

        bsp_feature = []
        for pgm_proposal in pgm_proposals:
            # NOTE(review): assumes proposal rows start with (tmin, tmax) —
            # matches the proposal format written by the PGM stage.
            tmin = pgm_proposal[0]
            tmax = pgm_proposal[1]

            tlen = tmax - tmin
            # Temporal range for start
            tmin_0 = tmin - tlen * bsp_boundary_ratio
            tmin_1 = tmin + tlen * bsp_boundary_ratio
            # Temporal range for end
            tmax_0 = tmax - tlen * bsp_boundary_ratio
            tmax_1 = tmax + tlen * bsp_boundary_ratio

            # Generate features at start boundary: sample the interpolated
            # actionness curve densely, then average every num_sample_interp
            # consecutive values into one feature bin.
            tlen_start = (tmin_1 - tmin_0) / (num_sample_start - 1)
            tlen_start_sample = tlen_start / num_sample_interp
            t_new = [
                tmin_0 - tlen_start / 2 + tlen_start_sample * i
                for i in range(num_sample_start * num_sample_interp + 1)
            ]
            y_new_start_action = np.interp(t_new, t_points, score_action)
            y_new_start = [
                np.mean(y_new_start_action[i * num_sample_interp:(i + 1) *
                                           num_sample_interp + 1])
                for i in range(num_sample_start)
            ]
            # Generate features at end boundary
            tlen_end = (tmax_1 - tmax_0) / (num_sample_end - 1)
            tlen_end_sample = tlen_end / num_sample_interp
            t_new = [
                tmax_0 - tlen_end / 2 + tlen_end_sample * i
                for i in range(num_sample_end * num_sample_interp + 1)
            ]
            y_new_end_action = np.interp(t_new, t_points, score_action)
            y_new_end = [
                np.mean(y_new_end_action[i * num_sample_interp:(i + 1) *
                                         num_sample_interp + 1])
                for i in range(num_sample_end)
            ]
            # Generate features for action (the proposal's center region)
            tlen_action = (tmax - tmin) / (num_sample_action - 1)
            tlen_action_sample = tlen_action / num_sample_interp
            t_new = [
                tmin - tlen_action / 2 + tlen_action_sample * i
                for i in range(num_sample_action * num_sample_interp + 1)
            ]
            y_new_action = np.interp(t_new, t_points, score_action)
            y_new_action = [
                np.mean(y_new_action[i * num_sample_interp:(i + 1) *
                                     num_sample_interp + 1])
                for i in range(num_sample_action)
            ]
            feature = np.concatenate([y_new_action, y_new_start, y_new_end])
            bsp_feature.append(feature)
        bsp_feature = np.array(bsp_feature)
        bsp_feature_dict[video_name] = bsp_feature
        if result_dict is not None:
            result_dict[video_name] = bsp_feature
    return bsp_feature_dict
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch


def temporal_iou(proposal_min, proposal_max, gt_min, gt_max):
    """Compute IoU score between a groundtruth bbox and the proposals.

    Args:
        proposal_min (list[float]): List of temporal anchor min.
        proposal_max (list[float]): List of temporal anchor max.
        gt_min (float): Groundtruth temporal box min.
        gt_max (float): Groundtruth temporal box max.

    Returns:
        list[float]: List of iou scores.
    """
    anchor_len = proposal_max - proposal_min
    overlap_lo = np.maximum(proposal_min, gt_min)
    overlap_hi = np.minimum(proposal_max, gt_max)
    overlap = np.maximum(overlap_hi - overlap_lo, 0.)
    # |A or B| = |A| - |A and B| + |B|
    union = anchor_len - overlap + gt_max - gt_min
    return np.divide(overlap, union)


def temporal_iop(proposal_min, proposal_max, gt_min, gt_max):
    """Compute IoP score between a groundtruth bbox and the proposals.

    IoP is the overlap with the groundtruth measured as a fraction of the
    proposal's own duration (intersection over proposal).

    Args:
        proposal_min (list[float]): List of temporal anchor min.
        proposal_max (list[float]): List of temporal anchor max.
        gt_min (float): Groundtruth temporal box min.
        gt_max (float): Groundtruth temporal box max.

    Returns:
        list[float]: List of intersection over anchor scores.
    """
    anchor_len = np.array(proposal_max - proposal_min)
    overlap_lo = np.maximum(proposal_min, gt_min)
    overlap_hi = np.minimum(proposal_max, gt_max)
    overlap = np.maximum(overlap_hi - overlap_lo, 0.)
    return np.divide(overlap, anchor_len)


def soft_nms(proposals, alpha, low_threshold, high_threshold, top_k):
    """Soft NMS for temporal proposals.

    Instead of discarding overlapping proposals outright, their scores are
    decayed with a Gaussian of the IoU with the currently selected proposal.

    Args:
        proposals (np.ndarray): Proposals generated by network.
        alpha (float): Alpha value of Gaussian decaying function.
        low_threshold (float): Low threshold for soft nms.
        high_threshold (float): High threshold for soft nms.
        top_k (int): Top k values to be considered.

    Returns:
        np.ndarray: The updated proposals.
    """
    # Visit candidates in descending score order.
    proposals = proposals[proposals[:, -1].argsort()[::-1]]
    starts = list(proposals[:, 0])
    ends = list(proposals[:, 1])
    scores = list(proposals[:, -1])
    keep_start = []
    keep_end = []
    keep_score = []

    while len(scores) > 0 and len(keep_score) <= top_k:
        best = np.argmax(scores)
        best_width = ends[best] - starts[best]
        ious = temporal_iou(starts[best], ends[best], np.array(starts),
                            np.array(ends))
        decay = np.exp(-np.square(ious) / alpha)

        # Width-dependent overlap cut-off: wider winners decay more rivals.
        overlap_cut = low_threshold + (high_threshold -
                                       low_threshold) * best_width
        for rival in range(len(scores)):
            if rival != best and ious[rival] > overlap_cut:
                scores[rival] = scores[rival] * decay[rival]

        # Move the winner (with its possibly-decayed score) to the output.
        keep_start.append(starts.pop(best))
        keep_end.append(ends.pop(best))
        keep_score.append(scores.pop(best))

    keep_start = np.array(keep_start).reshape(-1, 1)
    keep_end = np.array(keep_end).reshape(-1, 1)
    keep_score = np.array(keep_score).reshape(-1, 1)
    return np.concatenate((keep_start, keep_end, keep_score), axis=1)


def post_processing(result, video_info, soft_nms_alpha, soft_nms_low_threshold,
                    soft_nms_high_threshold, post_process_top_k,
                    feature_extraction_interval):
    """Post process for temporal proposals generation.

    Args:
        result (np.ndarray): Proposals generated by network.
        video_info (dict): Meta data of video. Required keys are
            'duration_frame', 'duration_second'.
        soft_nms_alpha (float): Alpha value of Gaussian decaying function.
        soft_nms_low_threshold (float): Low threshold for soft nms.
        soft_nms_high_threshold (float): High threshold for soft nms.
        post_process_top_k (int): Top k values to be considered.
        feature_extraction_interval (int): Interval used in feature
            extraction.

    Returns:
        list[dict]: The updated proposals, e.g.
            [{'score': 0.9, 'segment': [0, 1]},
             {'score': 0.8, 'segment': [0, 2]},
            ...].
    """
    if len(result) > 1:
        result = soft_nms(result, soft_nms_alpha, soft_nms_low_threshold,
                          soft_nms_high_threshold, post_process_top_k)

    result = result[result[:, -1].argsort()[::-1]]
    # Truncate the frame count to a multiple of the extraction interval and
    # convert that kept fraction of the video into seconds.
    aligned_frames = video_info[
        'duration_frame'] // feature_extraction_interval * \
        feature_extraction_interval
    video_duration = float(aligned_frames) / video_info[
        'duration_frame'] * video_info['duration_second']

    proposal_list = []
    for idx in range(min(post_process_top_k, len(result))):
        proposal_list.append({
            'score': float(result[idx, -1]),
            'segment': [
                max(0, result[idx, 0]) * video_duration,
                min(1, result[idx, 1]) * video_duration
            ]
        })
    return proposal_list


# Copied from
# 'https://github.com/qinzhi-0110/'
# 'Temporal-Context-Aggregation-Network-Pytorch/'
# 'blob/main/utils.py'
# TODO: refactor
def batch_iou(proposals, gt_boxes):
    """Row-wise temporal IoU between aligned proposal/groundtruth pairs."""
    prop_len = proposals[:, 1] - proposals[:, 0]
    lo = torch.max(proposals[:, 0], gt_boxes[:, 0])
    hi = torch.min(proposals[:, 1], gt_boxes[:, 1])
    inter = torch.clamp(hi - lo, min=0.)
    union = prop_len - inter + gt_boxes[:, 1] - gt_boxes[:, 0]
    # Small epsilon guards against zero-length unions.
    return inter / (union + 0.00001)


def bbox_xw_transform_inv(boxes, deltas, dx_w, dw_w):
    """Decode weighted (dx, dw) deltas into absolute (start, end) boxes."""
    width = boxes[:, 1] - boxes[:, 0]
    center = boxes[:, 0] + 0.5 * width

    shift = deltas[:, 0] * dx_w
    scale = deltas[:, 1] * dw_w

    new_center = shift * width + center
    new_width = torch.exp(scale) * width

    decoded = deltas.clone()
    decoded[:, 0] = new_center - 0.5 * new_width  # x1
    decoded[:, 1] = new_center + 0.5 * new_width  # x2
    return decoded


def bbox_xw_transform_batch(ex_rois, gt_rois):
    """Encode groundtruth boxes as (dx, dw) targets relative to anchors."""
    ex_width = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001)
    ex_center = ex_rois[:, 0] + 0.5 * ex_width

    gt_width = torch.clamp(gt_rois[:, 1] - gt_rois[:, 0], min=0.00001)
    gt_center = gt_rois[:, 0] + 0.5 * gt_width

    dx = (gt_center - ex_center) / ex_width
    dw = torch.log(gt_width / ex_width)
    return torch.stack((dx, dw), dim=1)


def bbox_se_transform_batch(ex_rois, gt_rois):
    """Encode groundtruth boxes as width-normalized start/end offsets."""
    ex_width = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001)

    ds = (gt_rois[:, 0] - ex_rois[:, 0]) / ex_width
    de = (gt_rois[:, 1] - ex_rois[:, 1]) / ex_width
    return torch.stack((ds, de), dim=1)


def bbox_se_transform_inv(boxes, deltas, dse_w):
    """Decode weighted start/end offsets back into absolute boxes."""
    width = boxes[:, 1] - boxes[:, 0]
    decoded = deltas.clone()
    decoded[:, 0] = boxes[:, 0] + deltas[:, 0] * width * dse_w
    decoded[:, 1] = boxes[:, 1] + deltas[:, 1] * width * dse_w
    return decoded
All rights reserved. +from .base import BaseWeightedLoss +from .binary_logistic_regression_loss import BinaryLogisticRegressionLoss +from .bmn_loss import BMNLoss +from .cross_entropy_loss import (BCELossWithLogits, CBFocalLoss, + CrossEntropyLoss) +from .hvu_loss import HVULoss +from .nll_loss import NLLLoss +from .ohem_hinge_loss import OHEMHingeLoss +from .ssn_loss import SSNLoss + +__all__ = [ + 'BaseWeightedLoss', 'CrossEntropyLoss', 'NLLLoss', 'BCELossWithLogits', + 'BinaryLogisticRegressionLoss', 'BMNLoss', 'OHEMHingeLoss', 'SSNLoss', + 'HVULoss', 'CBFocalLoss' +] diff --git a/mmaction/models/losses/__pycache__/__init__.cpython-312.pyc b/mmaction/models/losses/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28dd216a353a0391d458507a198fa27294ff5ba8 Binary files /dev/null and b/mmaction/models/losses/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/base.cpython-312.pyc b/mmaction/models/losses/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59db3adcae3fac6c08cdff65e6f2eacbf4e7c23f Binary files /dev/null and b/mmaction/models/losses/__pycache__/base.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/binary_logistic_regression_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/binary_logistic_regression_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a309de860c61c790bfe4422831d9c8628bd50f1 Binary files /dev/null and b/mmaction/models/losses/__pycache__/binary_logistic_regression_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/bmn_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/bmn_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f0a8e36889d3afdc1bf8cd301343f9e49ffdb89 Binary files /dev/null and b/mmaction/models/losses/__pycache__/bmn_loss.cpython-312.pyc differ 
diff --git a/mmaction/models/losses/__pycache__/cross_entropy_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/cross_entropy_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc8ad28991c8e35e0bed62d728f999330f0ac6c9 Binary files /dev/null and b/mmaction/models/losses/__pycache__/cross_entropy_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/hvu_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/hvu_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..047c08d81ed15386ead959d7942309a1ccd9f4b5 Binary files /dev/null and b/mmaction/models/losses/__pycache__/hvu_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/nll_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/nll_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..308a3f1ba6d3d06b514e54851f01484b57f4b687 Binary files /dev/null and b/mmaction/models/losses/__pycache__/nll_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/ohem_hinge_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/ohem_hinge_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9273fc3e5904edda488b689bcba8a99d4793ad24 Binary files /dev/null and b/mmaction/models/losses/__pycache__/ohem_hinge_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/__pycache__/ssn_loss.cpython-312.pyc b/mmaction/models/losses/__pycache__/ssn_loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95e30a766d48fb275d2fcec1c6684066aabc631f Binary files /dev/null and b/mmaction/models/losses/__pycache__/ssn_loss.cpython-312.pyc differ diff --git a/mmaction/models/losses/base.py b/mmaction/models/losses/base.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd3c797e3a500154221293016b75e47bb208e3f --- /dev/null +++ b/mmaction/models/losses/base.py @@ 
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod

import torch.nn as nn


class BaseWeightedLoss(nn.Module, metaclass=ABCMeta):
    """Base class for loss.

    Subclasses implement ``_forward()``, which computes the raw
    (unweighted) loss; this base class scales that result by
    ``loss_weight`` in ``forward()``.

    Args:
        loss_weight (float): Factor scalar multiplied on the loss.
            Default: 1.0.
    """

    def __init__(self, loss_weight=1.0):
        super().__init__()
        self.loss_weight = loss_weight

    @abstractmethod
    def _forward(self, *args, **kwargs):
        """Compute the unweighted loss; must be overridden."""

    def forward(self, *args, **kwargs):
        """Defines the computation performed at every call.

        Args:
            *args: The positional arguments for the corresponding
                loss.
            **kwargs: The keyword arguments for the corresponding
                loss.

        Returns:
            torch.Tensor: The calculated loss.
        """
        result = self._forward(*args, **kwargs)
        if not isinstance(result, dict):
            result *= self.loss_weight
            return result
        # Dict results: only entries whose key contains 'loss' are scaled;
        # any other entries (e.g. logging metrics) pass through unchanged.
        for key in result:
            if 'loss' in key:
                result[key] *= self.loss_weight
        return result
import torch
import torch.nn as nn

from mmaction.registry import MODELS


def binary_logistic_regression_loss(reg_score,
                                    label,
                                    threshold=0.5,
                                    ratio_range=(1.05, 21),
                                    eps=1e-5):
    """Binary Logistic Regression Loss with class-balancing coefficients."""
    flat_label = label.view(-1).to(reg_score.device)
    flat_score = reg_score.contiguous().view(-1)

    pos_mask = (flat_label > threshold).float().to(reg_score.device)
    # Guard against a batch with no positives.
    num_positive = max(torch.sum(pos_mask), 1)
    num_entries = len(flat_label)
    ratio = num_entries / num_positive
    # Keep the positive/negative balancing ratio inside ratio_range.
    ratio = min(max(ratio, ratio_range[0]), ratio_range[1])

    neg_coef = 0.5 * ratio / (ratio - 1)
    pos_coef = 0.5 * ratio
    elementwise = pos_coef * pos_mask * torch.log(flat_score + eps) + \
        neg_coef * (1.0 - pos_mask) * torch.log(1.0 - flat_score + eps)
    return -torch.mean(elementwise)


@MODELS.register_module()
class BinaryLogisticRegressionLoss(nn.Module):
    """Binary Logistic Regression Loss.

    It will calculate binary logistic regression loss given reg_score and
    label.
    """

    def forward(self,
                reg_score,
                label,
                threshold=0.5,
                ratio_range=(1.05, 21),
                eps=1e-5):
        """Calculate Binary Logistic Regression Loss.

        Args:
            reg_score (torch.Tensor): Predicted score by model.
            label (torch.Tensor): Groundtruth labels.
            threshold (float): Threshold for positive instances.
                Default: 0.5.
            ratio_range (tuple): Lower bound and upper bound for ratio.
                Default: (1.05, 21)
            eps (float): Epsilon for small value. Default: 1e-5.

        Returns:
            torch.Tensor: Returned binary logistic loss.
        """
        return binary_logistic_regression_loss(reg_score, label, threshold,
                                               ratio_range, eps)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F

from mmaction.registry import MODELS
from .binary_logistic_regression_loss import binary_logistic_regression_loss


@MODELS.register_module()
class BMNLoss(nn.Module):
    """BMN Loss.

    From paper https://arxiv.org/abs/1907.09702,
    code https://github.com/JJBOY/BMN-Boundary-Matching-Network.
    It will calculate loss for BMN Model. This loss is a weighted sum of

        1) temporal evaluation loss based on confidence score of start and
        end positions.
        2) proposal evaluation regression loss based on confidence scores of
        candidate proposals.
        3) proposal evaluation classification loss based on classification
        results of candidate proposals.
    """

    @staticmethod
    def tem_loss(pred_start, pred_end, gt_start, gt_end):
        """Calculate Temporal Evaluation Module Loss.

        This function calculate the binary_logistic_regression_loss for start
        and end respectively and returns the sum of their losses.

        Args:
            pred_start (torch.Tensor): Predicted start score by BMN model.
            pred_end (torch.Tensor): Predicted end score by BMN model.
            gt_start (torch.Tensor): Groundtruth confidence score for start.
            gt_end (torch.Tensor): Groundtruth confidence score for end.

        Returns:
            torch.Tensor: Returned binary logistic loss.
        """
        loss_start = binary_logistic_regression_loss(pred_start, gt_start)
        loss_end = binary_logistic_regression_loss(pred_end, gt_end)
        loss = loss_start + loss_end
        return loss

    @staticmethod
    def pem_reg_loss(pred_score,
                     gt_iou_map,
                     mask,
                     high_temporal_iou_threshold=0.7,
                     low_temporal_iou_threshold=0.3):
        """Calculate Proposal Evaluation Module Regression Loss.

        Args:
            pred_score (torch.Tensor): Predicted temporal_iou score by BMN.
            gt_iou_map (torch.Tensor): Groundtruth temporal_iou score.
            mask (torch.Tensor): Boundary-Matching mask.
            high_temporal_iou_threshold (float): Higher threshold of
                temporal_iou. Default: 0.7.
            low_temporal_iou_threshold (float): Lower threshold of
                temporal_iou. Default: 0.3.

        Returns:
            torch.Tensor: Proposal evaluation regression loss.
        """
        # Partition map cells into high / mid / low IoU groups.
        u_hmask = (gt_iou_map > high_temporal_iou_threshold).float()
        u_mmask = ((gt_iou_map <= high_temporal_iou_threshold) &
                   (gt_iou_map > low_temporal_iou_threshold)).float()
        u_lmask = ((gt_iou_map <= low_temporal_iou_threshold) &
                   (gt_iou_map > 0.)).float()
        u_lmask = u_lmask * mask

        num_h = torch.sum(u_hmask)
        num_m = torch.sum(u_mmask)
        num_l = torch.sum(u_lmask)

        # Randomly subsample the mid/low groups so each retained group is,
        # in expectation, the same size as the high-IoU group.
        # NOTE: uses torch.rand_like, so this loss is stochastic.
        r_m = num_h / num_m
        u_smmask = torch.rand_like(gt_iou_map)
        u_smmask = u_mmask * u_smmask
        u_smmask = (u_smmask > (1. - r_m)).float()

        r_l = num_h / num_l
        u_slmask = torch.rand_like(gt_iou_map)
        u_slmask = u_lmask * u_slmask
        u_slmask = (u_slmask > (1. - r_l)).float()

        weights = u_hmask + u_smmask + u_slmask

        loss = F.mse_loss(pred_score * weights, gt_iou_map * weights)
        loss = 0.5 * torch.sum(
            loss * torch.ones_like(weights)) / torch.sum(weights)

        return loss

    @staticmethod
    def pem_cls_loss(pred_score,
                     gt_iou_map,
                     mask,
                     threshold=0.9,
                     ratio_range=(1.05, 21),
                     eps=1e-5):
        """Calculate Proposal Evaluation Module Classification Loss.

        Args:
            pred_score (torch.Tensor): Predicted temporal_iou score by BMN.
            gt_iou_map (torch.Tensor): Groundtruth temporal_iou score.
            mask (torch.Tensor): Boundary-Matching mask.
            threshold (float): Threshold of temporal_iou for positive
                instances. Default: 0.9.
            ratio_range (tuple): Lower bound and upper bound for ratio.
                Default: (1.05, 21)
            eps (float): Epsilon for small value. Default: 1e-5

        Returns:
            torch.Tensor: Proposal evaluation classification loss.
        """
        pmask = (gt_iou_map > threshold).float()
        nmask = (gt_iou_map <= threshold).float()
        nmask = nmask * mask

        # Balance positives vs negatives, clamping the ratio to ratio_range.
        num_positive = max(torch.sum(pmask), 1)
        num_entries = num_positive + torch.sum(nmask)
        ratio = num_entries / num_positive
        ratio = torch.clamp(ratio, ratio_range[0], ratio_range[1])

        coef_0 = 0.5 * ratio / (ratio - 1)
        coef_1 = 0.5 * ratio

        loss_pos = coef_1 * torch.log(pred_score + eps) * pmask
        loss_neg = coef_0 * torch.log(1.0 - pred_score + eps) * nmask
        loss = -1 * torch.sum(loss_pos + loss_neg) / num_entries
        return loss

    def forward(self,
                pred_bm,
                pred_start,
                pred_end,
                gt_iou_map,
                gt_start,
                gt_end,
                bm_mask,
                weight_tem=1.0,
                weight_pem_reg=10.0,
                weight_pem_cls=1.0):
        """Calculate Boundary Matching Network Loss.

        Args:
            pred_bm (torch.Tensor): Predicted confidence score for boundary
                matching map.
            pred_start (torch.Tensor): Predicted confidence score for start.
            pred_end (torch.Tensor): Predicted confidence score for end.
            gt_iou_map (torch.Tensor): Groundtruth score for boundary matching
                map.
            gt_start (torch.Tensor): Groundtruth temporal_iou score for start.
            gt_end (torch.Tensor): Groundtruth temporal_iou score for end.
            bm_mask (torch.Tensor): Boundary-Matching mask.
            weight_tem (float): Weight for tem loss. Default: 1.0.
            weight_pem_reg (float): Weight for pem regression loss.
                Default: 10.0.
            weight_pem_cls (float): Weight for pem classification loss.
                Default: 1.0.

        Returns:
            tuple([torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
                (loss, tem_loss, pem_reg_loss, pem_cls_loss). Loss is the bmn
                loss, tem_loss is the temporal evaluation loss, pem_reg_loss is
                the proposal evaluation regression loss, pem_cls_loss is the
                proposal evaluation classification loss.
        """
        # Channel 0 of pred_bm is the regression branch, channel 1 the
        # classification branch.
        pred_bm_reg = pred_bm[:, 0].contiguous()
        pred_bm_cls = pred_bm[:, 1].contiguous()
        gt_iou_map = gt_iou_map * bm_mask

        pem_reg_loss = self.pem_reg_loss(pred_bm_reg, gt_iou_map, bm_mask)
        pem_cls_loss = self.pem_cls_loss(pred_bm_cls, gt_iou_map, bm_mask)
        tem_loss = self.tem_loss(pred_start, pred_end, gt_start, gt_end)
        loss = (
            weight_tem * tem_loss + weight_pem_reg * pem_reg_loss +
            weight_pem_cls * pem_cls_loss)
        return loss, tem_loss, pem_reg_loss, pem_cls_loss


# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional

import numpy as np
import torch
import torch.nn.functional as F

from mmaction.registry import MODELS
from .base import BaseWeightedLoss


@MODELS.register_module()
class CrossEntropyLoss(BaseWeightedLoss):
    """Cross Entropy Loss.

    Support two kinds of labels and their corresponding loss type. It's worth
    mentioning that loss type will be detected by the shape of ``cls_score``
    and ``label``.
    1) Hard label: This label is an integer array and all of the elements are
        in the range [0, num_classes - 1]. This label's shape should be
        ``cls_score``'s shape with the `num_classes` dimension removed.
    2) Soft label(probability distribution over classes): This label is a
        probability distribution and all of the elements are in the range
        [0, 1]. This label's shape must be the same as ``cls_score``. For now,
        only 2-dim soft label is supported.

    Args:
        loss_weight (float): Factor scalar multiplied on the loss.
            Defaults to 1.0.
        class_weight (list[float] | None): Loss weight for each class. If set
            as None, use the same weight 1 for all classes. Only applies
            to CrossEntropyLoss and BCELossWithLogits (should not be set when
            using other losses). Defaults to None.
    """

    def __init__(self,
                 loss_weight: float = 1.0,
                 class_weight: Optional[List[float]] = None) -> None:
        super().__init__(loss_weight=loss_weight)
        self.class_weight = None
        if class_weight is not None:
            self.class_weight = torch.Tensor(class_weight)

    def _forward(self, cls_score: torch.Tensor, label: torch.Tensor,
                 **kwargs) -> torch.Tensor:
        """Forward function.

        Args:
            cls_score (torch.Tensor): The class score.
            label (torch.Tensor): The ground truth label.
            kwargs: Any keyword argument to be used to calculate
                CrossEntropy loss.

        Returns:
            torch.Tensor: The returned CrossEntropy loss.
        """
        # Matching shapes signal a soft (probability-distribution) label.
        if cls_score.size() == label.size():
            # calculate loss for soft label

            assert cls_score.dim() == 2, 'Only support 2-dim soft label'
            assert len(kwargs) == 0, \
                ('For now, no extra args are supported for soft label, '
                 f'but get {kwargs}')

            lsm = F.log_softmax(cls_score, 1)
            if self.class_weight is not None:
                self.class_weight = self.class_weight.to(cls_score.device)
                lsm = lsm * self.class_weight.unsqueeze(0)
            loss_cls = -(label * lsm).sum(1)

            # default reduction 'mean'
            if self.class_weight is not None:
                # Use weighted average as pytorch CrossEntropyLoss does.
                # For more information, please visit https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html # noqa
                loss_cls = loss_cls.sum() / torch.sum(
                    self.class_weight.unsqueeze(0) * label)
            else:
                loss_cls = loss_cls.mean()
        else:
            # calculate loss for hard label

            if self.class_weight is not None:
                assert 'weight' not in kwargs, \
                    "The key 'weight' already exists."
                kwargs['weight'] = self.class_weight.to(cls_score.device)
            loss_cls = F.cross_entropy(cls_score, label, **kwargs)

        return loss_cls


@MODELS.register_module()
class BCELossWithLogits(BaseWeightedLoss):
    """Binary Cross Entropy Loss with logits.

    Args:
        loss_weight (float): Factor scalar multiplied on the loss.
            Defaults to 1.0.
        class_weight (list[float] | None): Loss weight for each class. If set
            as None, use the same weight 1 for all classes. Only applies
            to CrossEntropyLoss and BCELossWithLogits (should not be set when
            using other losses). Defaults to None.
    """

    def __init__(self,
                 loss_weight: float = 1.0,
                 class_weight: Optional[List[float]] = None) -> None:
        super().__init__(loss_weight=loss_weight)
        self.class_weight = None
        if class_weight is not None:
            self.class_weight = torch.Tensor(class_weight)

    def _forward(self, cls_score: torch.Tensor, label: torch.Tensor,
                 **kwargs) -> torch.Tensor:
        """Forward function.

        Args:
            cls_score (torch.Tensor): The class score.
            label (torch.Tensor): The ground truth label.
            kwargs: Any keyword argument to be used to calculate
                bce loss with logits.

        Returns:
            torch.Tensor: The returned bce loss with logits.
        """
        if self.class_weight is not None:
            assert 'weight' not in kwargs, "The key 'weight' already exists."
            kwargs['weight'] = self.class_weight.to(cls_score.device)
        loss_cls = F.binary_cross_entropy_with_logits(cls_score, label,
                                                      **kwargs)
        return loss_cls


@MODELS.register_module()
class CBFocalLoss(BaseWeightedLoss):
    """Class Balanced Focal Loss. Adapted from https://github.com/abhinanda-
    punnakkal/BABEL/. This loss is used in the skeleton-based action
    recognition baseline for BABEL.

    Args:
        loss_weight (float): Factor scalar multiplied on the loss.
            Defaults to 1.0.
        samples_per_cls (list[int]): The number of samples per class.
            Defaults to [].
        beta (float): Hyperparameter that controls the per class loss weight.
            Defaults to 0.9999.
        gamma (float): Hyperparameter of the focal loss. Defaults to 2.0.
    """

    def __init__(self,
                 loss_weight: float = 1.0,
                 samples_per_cls: List[int] = [],
                 beta: float = 0.9999,
                 gamma: float = 2.) -> None:
        super().__init__(loss_weight=loss_weight)
        self.samples_per_cls = samples_per_cls
        self.beta = beta
        self.gamma = gamma
        # Class-balanced weights from "Class-Balanced Loss Based on
        # Effective Number of Samples": w_c = (1 - beta) / (1 - beta^n_c),
        # normalized so the weights sum to the number of classes.
        effective_num = 1.0 - np.power(beta, samples_per_cls)
        weights = (1.0 - beta) / np.array(effective_num)
        weights = weights / np.sum(weights) * len(weights)
        self.weights = weights
        self.num_classes = len(weights)

    def _forward(self, cls_score: torch.Tensor, label: torch.Tensor,
                 **kwargs) -> torch.Tensor:
        """Forward function.

        Args:
            cls_score (torch.Tensor): The class score.
            label (torch.Tensor): The ground truth label.
            kwargs: Any keyword argument to be used to calculate
                the focal loss.

        Returns:
            torch.Tensor: The returned class-balanced focal loss.
        """
        weights = torch.tensor(self.weights).float().to(cls_score.device)
        label_one_hot = F.one_hot(label, self.num_classes).float()
        # Broadcast per-class weights to per-sample weights via the one-hot
        # labels: each row keeps only the weight of its target class.
        weights = weights.unsqueeze(0)
        weights = weights.repeat(label_one_hot.shape[0], 1) * label_one_hot
        weights = weights.sum(1)
        weights = weights.unsqueeze(1)
        weights = weights.repeat(1, self.num_classes)

        BCELoss = F.binary_cross_entropy_with_logits(
            input=cls_score, target=label_one_hot, reduction='none')

        # Focal modulator (1 - p_t)^gamma written in log-space for stability.
        modulator = 1.0
        if self.gamma:
            modulator = torch.exp(-self.gamma * label_one_hot * cls_score -
                                  self.gamma *
                                  torch.log(1 + torch.exp(-1.0 * cls_score)))

        loss = modulator * BCELoss
        weighted_loss = weights * loss

        focal_loss = torch.sum(weighted_loss)
        focal_loss /= torch.sum(label_one_hot)

        return focal_loss
def __init__(self,
             categories=('action', 'attribute', 'concept', 'event',
                         'object', 'scene'),
             category_nums=(739, 117, 291, 69, 1678, 248),
             category_loss_weights=(1, 1, 1, 1, 1, 1),
             loss_type='all',
             with_mask=False,
             reduction='mean',
             loss_weight=1.0):
    """Initialize the HVU loss; see the class docstring for the meaning
    of each argument."""
    super().__init__(loss_weight)
    self.categories = categories
    self.category_nums = category_nums
    self.category_loss_weights = category_loss_weights
    # One non-negative loss weight per tag category.
    assert len(self.category_nums) == len(self.category_loss_weights)
    for weight in self.category_loss_weights:
        assert weight >= 0
    self.loss_type = loss_type
    self.with_mask = with_mask
    self.reduction = reduction
    # Start offset of each category in the flat tag vector
    # (exclusive prefix sum of category_nums).
    offsets, running = [], 0
    for num in self.category_nums:
        offsets.append(running)
        running += num
    self.category_startidx = offsets
    assert self.loss_type in ['individual', 'all']
    assert self.reduction in ['mean', 'sum']
+ """ + + if self.loss_type == 'all': + loss_cls = F.binary_cross_entropy_with_logits( + cls_score, label, reduction='none') + if self.with_mask: + w_loss_cls = mask * loss_cls + w_loss_cls = torch.sum(w_loss_cls, dim=1) + if self.reduction == 'mean': + w_loss_cls = w_loss_cls / torch.sum(mask, dim=1) + w_loss_cls = torch.mean(w_loss_cls) + return dict(loss_cls=w_loss_cls) + + if self.reduction == 'sum': + loss_cls = torch.sum(loss_cls, dim=-1) + return dict(loss_cls=torch.mean(loss_cls)) + + if self.loss_type == 'individual': + losses = {} + loss_weights = {} + for name, num, start_idx in zip(self.categories, + self.category_nums, + self.category_startidx): + category_score = cls_score[:, start_idx:start_idx + num] + category_label = label[:, start_idx:start_idx + num] + category_loss = F.binary_cross_entropy_with_logits( + category_score, category_label, reduction='none') + if self.reduction == 'mean': + category_loss = torch.mean(category_loss, dim=1) + elif self.reduction == 'sum': + category_loss = torch.sum(category_loss, dim=1) + + idx = self.categories.index(name) + if self.with_mask: + category_mask_i = category_mask[:, idx].reshape(-1) + # there should be at least one sample which contains tags + # in this category + if torch.sum(category_mask_i) < 0.5: + losses[f'{name}_LOSS'] = torch.tensor( + .0, device=get_device()) + loss_weights[f'{name}_LOSS'] = .0 + continue + category_loss = torch.sum(category_loss * category_mask_i) + category_loss = category_loss / torch.sum(category_mask_i) + else: + category_loss = torch.mean(category_loss) + # We name the loss of each category as 'LOSS', since we only + # want to monitor them, not backward them. 
We will also provide + # the loss used for backward in the losses dictionary + losses[f'{name}_LOSS'] = category_loss + loss_weights[f'{name}_LOSS'] = self.category_loss_weights[idx] + loss_weight_sum = sum(loss_weights.values()) + loss_weights = { + k: v / loss_weight_sum + for k, v in loss_weights.items() + } + loss_cls = sum([losses[k] * loss_weights[k] for k in losses]) + losses['loss_cls'] = loss_cls + # We also trace the loss weights + losses.update({ + k + '_weight': torch.tensor(v).to(losses[k].device) + for k, v in loss_weights.items() + }) + # Note that the loss weights are just for reference. + return losses + else: + raise ValueError("loss_type should be 'all' or 'individual', " + f'but got {self.loss_type}') diff --git a/mmaction/models/losses/nll_loss.py b/mmaction/models/losses/nll_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..48577199464540ee612387684928490e0d0d7bb1 --- /dev/null +++ b/mmaction/models/losses/nll_loss.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F + +from mmaction.registry import MODELS +from .base import BaseWeightedLoss + + +@MODELS.register_module() +class NLLLoss(BaseWeightedLoss): + """NLL Loss. + + It will calculate NLL loss given cls_score and label. + """ + + def _forward(self, cls_score, label, **kwargs): + """Forward function. + + Args: + cls_score (torch.Tensor): The class score. + label (torch.Tensor): The ground truth label. + kwargs: Any keyword argument to be used to calculate nll loss. + + Returns: + torch.Tensor: The returned nll loss. + """ + loss_cls = F.nll_loss(cls_score, label, **kwargs) + return loss_cls diff --git a/mmaction/models/losses/ohem_hinge_loss.py b/mmaction/models/losses/ohem_hinge_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..11d6da7c698e280c1ac0fbed95503816381927a3 --- /dev/null +++ b/mmaction/models/losses/ohem_hinge_loss.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. 
class OHEMHingeLoss(torch.autograd.Function):
    """Core implementation for the completeness loss in the SSN paper.

    Computes class-wise hinge loss and performs online hard example
    mining (OHEM): within each group (video), only the hardest
    ``group_size * ohem_ratio`` proposals contribute to the loss.
    """

    @staticmethod
    def forward(ctx, pred, labels, is_positive, ohem_ratio, group_size):
        """Calculate OHEM hinge loss.

        Args:
            pred (torch.Tensor): Predicted completeness score, shape
                ``(num_samples, num_classes)``.
            labels (torch.Tensor): Groundtruth class label (1-based).
            is_positive (int): Set to 1 when proposals are positive and
                set to -1 when proposals are incomplete.
            ohem_ratio (float): Ratio of hard examples kept per group.
            group_size (int): Number of proposals sampled per video.

        Returns:
            torch.Tensor: Class-wise hinge loss, shape ``(1, )``.
        """
        num_samples = pred.size(0)
        if num_samples != len(labels):
            raise ValueError(f'Number of samples should be equal to that '
                             f'of labels, but got {num_samples} samples and '
                             f'{len(labels)} labels.')

        # Vectorized hinge loss on pred[i, labels[i] - 1] (the original
        # looped over samples in Python, O(n) interpreter overhead).
        row_idx = torch.arange(num_samples, device=pred.device)
        picked = pred[row_idx, labels - 1]
        losses = torch.clamp(1 - is_positive * picked, min=0)
        # Sub-gradient slope of the hinge w.r.t. the picked score.
        slopes = torch.where(losses != 0,
                             torch.full_like(losses, -is_positive),
                             torch.zeros_like(losses))

        # Keep only the hardest `keep_length` proposals in each group.
        grouped = losses.view(-1, group_size).contiguous()
        sorted_losses, indices = torch.sort(grouped, dim=1, descending=True)
        keep_length = int(group_size * ohem_ratio)
        loss = sorted_losses[:, :keep_length].sum().reshape(1)

        ctx.loss_index = indices[:, :keep_length]
        ctx.labels = labels
        ctx.slopes = slopes
        ctx.shape = pred.size()
        ctx.group_size = group_size
        ctx.num_groups = grouped.size(0)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """Route the hinge slope to the selected (hard) entries only;
        all other gradients are zero."""
        labels = ctx.labels
        slopes = ctx.slopes

        grad_in = torch.zeros(ctx.shape, device=ctx.slopes.device)
        for group in range(ctx.num_groups):
            for idx in ctx.loss_index[group]:
                loc = idx + group * ctx.group_size
                grad_in[loc, labels[loc] - 1] = (
                    slopes[loc] * grad_output[0])
        # Return the tensor directly: wrapping it in the long-deprecated
        # torch.autograd.Variable (and reading grad_output.data) is a
        # no-op in modern PyTorch and slated for removal.
        return grad_in, None, None, None, None
+ Default: 0.17. + + Returns: + torch.Tensor: Returned class-wise completeness loss. + """ + pred = completeness_score[completeness_indexer, :] + gt = labels[completeness_indexer] + + pred_dim = pred.size(1) + pred = pred.view(-1, positive_per_video + incomplete_per_video, + pred_dim) + gt = gt.view(-1, positive_per_video + incomplete_per_video) + + # yapf:disable + positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim) # noqa:E501 + incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim) # noqa:E501 + # yapf:enable + + positive_loss = OHEMHingeLoss.apply( + positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1, + 1.0, positive_per_video) + incomplete_loss = OHEMHingeLoss.apply( + incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1), + -1, ohem_ratio, incomplete_per_video) + num_positives = positive_pred.size(0) + num_incompletes = int(incomplete_pred.size(0) * ohem_ratio) + + return ((positive_loss + incomplete_loss) / + float(num_positives + num_incompletes)) + + @staticmethod + def classwise_regression_loss(bbox_pred, labels, bbox_targets, + regression_indexer): + """Classwise Regression Loss. + + It will calculate classwise_regression loss given + class_reg_pred and targets. + + Args: + bbox_pred (torch.Tensor): Predicted interval center and span + of positive proposals. + labels (torch.Tensor): Groundtruth class label. + bbox_targets (torch.Tensor): Groundtruth center and span + of positive proposals. + regression_indexer (torch.Tensor): Index slices of + positive proposals. + + Returns: + torch.Tensor: Returned class-wise regression loss. 
+ """ + pred = bbox_pred[regression_indexer, :, :] + gt = labels[regression_indexer] + reg_target = bbox_targets[regression_indexer, :] + + class_idx = gt.data - 1 + classwise_pred = pred[:, class_idx, :] + classwise_reg_pred = torch.cat( + (torch.diag(classwise_pred[:, :, 0]).view( + -1, 1), torch.diag(classwise_pred[:, :, 1]).view(-1, 1)), + dim=1) + loss = F.smooth_l1_loss( + classwise_reg_pred.view(-1), reg_target.view(-1)) * 2 + return loss + + def forward(self, activity_score, completeness_score, bbox_pred, + proposal_type, labels, bbox_targets, train_cfg): + """Calculate Boundary Matching Network Loss. + + Args: + activity_score (torch.Tensor): Predicted activity score. + completeness_score (torch.Tensor): Predicted completeness score. + bbox_pred (torch.Tensor): Predicted interval center and span + of positive proposals. + proposal_type (torch.Tensor): Type index slices of proposals. + labels (torch.Tensor): Groundtruth class label. + bbox_targets (torch.Tensor): Groundtruth center and span + of positive proposals. + train_cfg (dict): Config for training. + + Returns: + dict([torch.Tensor, torch.Tensor, torch.Tensor]): + (loss_activity, loss_completeness, loss_reg). + Loss_activity is the activity loss, loss_completeness is + the class-wise completeness loss, + loss_reg is the class-wise regression loss. 
+ """ + self.sampler = train_cfg.ssn.sampler + self.loss_weight = train_cfg.ssn.loss_weight + losses = dict() + + proposal_type = proposal_type.view(-1) + labels = labels.view(-1) + activity_indexer = ((proposal_type == 0) + + (proposal_type == 2)).nonzero().squeeze(1) + completeness_indexer = ((proposal_type == 0) + + (proposal_type == 1)).nonzero().squeeze(1) + + total_ratio = ( + self.sampler.positive_ratio + self.sampler.background_ratio + + self.sampler.incomplete_ratio) + positive_per_video = int(self.sampler.num_per_video * + (self.sampler.positive_ratio / total_ratio)) + background_per_video = int( + self.sampler.num_per_video * + (self.sampler.background_ratio / total_ratio)) + incomplete_per_video = ( + self.sampler.num_per_video - positive_per_video - + background_per_video) + + losses['loss_activity'] = self.activity_loss(activity_score, labels, + activity_indexer) + + losses['loss_completeness'] = self.completeness_loss( + completeness_score, + labels, + completeness_indexer, + positive_per_video, + incomplete_per_video, + ohem_ratio=positive_per_video / incomplete_per_video) + losses['loss_completeness'] *= self.loss_weight.comp_loss_weight + + if bbox_pred is not None: + regression_indexer = (proposal_type == 0).nonzero().squeeze(1) + bbox_targets = bbox_targets.view(-1, 2) + losses['loss_reg'] = self.classwise_regression_loss( + bbox_pred, labels, bbox_targets, regression_indexer) + losses['loss_reg'] *= self.loss_weight.reg_loss_weight + + return losses diff --git a/mmaction/models/multimodal/__init__.py b/mmaction/models/multimodal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7a34211fc5f01ae4ec882b4dd18b7fd854d61993 --- /dev/null +++ b/mmaction/models/multimodal/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmaction.utils.dependency import WITH_MULTIMODAL + +if WITH_MULTIMODAL: + from .vindlu import * # noqa: F401,F403 + +else: + from mmaction.registry import MODELS + from mmaction.utils.dependency import register_multimodal_placeholder + + register_multimodal_placeholder( + ['VindLUVQA', 'VindLURetrievalMC', 'VindLURetrieval'], MODELS) diff --git a/mmaction/models/multimodal/__pycache__/__init__.cpython-312.pyc b/mmaction/models/multimodal/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57a4fdac57282118a8b8d5e93060e9f516e309d1 Binary files /dev/null and b/mmaction/models/multimodal/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/multimodal/vindlu/__init__.py b/mmaction/models/multimodal/vindlu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78fb497a6bdef65dca416cf9ae97b091b52bfe0b --- /dev/null +++ b/mmaction/models/multimodal/vindlu/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .beit3d import BeitModel3D +from .tokenizer import VindLUTokenizer +from .vindlu_ret import VindLURetrieval +from .vindlu_ret_mc import VindLURetrievalMC +from .vindlu_vqa import VindLUVQA +from .xbert import BertDecoder, BertModel + +__all__ = [ + 'VindLUVQA', 'VindLURetrievalMC', 'VindLURetrieval', 'VindLUTokenizer', + 'BeitModel3D', 'BertDecoder', 'BertModel' +] diff --git a/mmaction/models/multimodal/vindlu/beit3d.py b/mmaction/models/multimodal/vindlu/beit3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0e29f42cae779f0345c4869d9afdd42debddee13 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/beit3d.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def interpolate_temporal_pos_embed(temp_embed_old, num_frames_new):
    """Linearly resample temporal position embeddings to a new length.

    Args:
        temp_embed_old (torch.Tensor): Embeddings of shape
            ``(1, num_frames_old, 1, d)``.
        num_frames_new (int): Target number of frames.

    Returns:
        torch.Tensor: Embeddings of shape ``(1, num_frames_new, 1, d)``.
    """
    # (1, num_frames_old, 1, d) -> (1, d, num_frames_old)
    squeezed = temp_embed_old.squeeze(2).permute(0, 2, 1)
    # Resample along the frame axis.
    resampled = F.interpolate(squeezed, num_frames_new, mode='linear')
    # (1, d, num_frames_new) -> (1, num_frames_new, 1, d)
    return resampled.permute(0, 2, 1).unsqueeze(2)
class BeitPooler3D(nn.Module):
    """Pool frame-level Beit hidden states into per-frame features.

    When the config enables mean pooling, the patch tokens (excluding
    the leading CLS token and any appended prompt tokens) are averaged
    and layer-normalized; otherwise the CLS token embedding is used.
    """

    def __init__(self, config: 'BeitConfig') -> None:
        super().__init__()
        self.num_prompts = config.add_k_prompts
        if config.use_mean_pooling:
            self.layernorm = nn.LayerNorm(
                config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.layernorm = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states (torch.Tensor): Shape ``[B, T, L, C]``.

        Returns:
            torch.Tensor: Pooled features of shape ``[B, T, C]``.
        """
        if self.layernorm is None:
            # Pool by simply taking the final hidden state of the [CLS]
            # token of every frame.
            return hidden_states[:, :, 0]
        # Mean pool the patch tokens: drop the CLS token and, when
        # present, the trailing prompt tokens.
        if self.num_prompts > 0:
            patches = hidden_states[:, :, 1:-self.num_prompts, :]
        else:
            patches = hidden_states[:, :, 1:, :]
        return self.layernorm(patches.mean(2))
def forward(self) -> torch.Tensor:
    """Build the full relative position bias, including prompt entries.

    Returns:
        torch.Tensor: Bias of shape ``(num_heads, L + k, L + k)`` where
        ``L = Wh * Ww + 1`` (patches + CLS) and ``k = add_k_prompts``.
    """
    # 2D relative position bias for the (CLS + patch) tokens.
    num_tokens = self.window_size[0] * self.window_size[1] + 1
    relative_position_bias = self.relative_position_bias_table[
        self.relative_position_index.view(-1)].view(
            num_tokens, num_tokens, -1)  # Wh*Ww+1, Wh*Ww+1, nH

    k = self.k
    if k > 0:
        bias = torch.zeros(num_tokens + k, num_tokens + k,
                           relative_position_bias.shape[-1]).to(
                               relative_position_bias.device)
        bias[:num_tokens, :num_tokens] = relative_position_bias
        # Rows [0, k) of the prompt table: prompt-to-token bias.
        bias[num_tokens:, :num_tokens] = \
            self.prompt_bias_table[:k].view(k, 1, -1)
        # Rows [k, 2k): token-to-prompt bias.
        bias[:num_tokens, num_tokens:] = \
            self.prompt_bias_table[k:2 * k].view(1, k, -1)
        # Rows [2k, 2k + k*k): prompt-to-prompt bias. BUGFIX: the
        # original indexed a single row (`[2 * k, :]`) and then called
        # .view(k, k, -1), which only worked for k == 1 and crashed for
        # any larger k; the slice takes all k*k rows as intended.
        bias[num_tokens:, num_tokens:] = \
            self.prompt_bias_table[2 * k:, :].view(k, k, -1)
    else:
        bias = relative_position_bias

    return bias.permute(2, 0, 1).contiguous()  # nH, L+k, L+k
    def forward(self,
                pixel_values: torch.Tensor,
                bool_masked_pos: Optional[torch.BoolTensor] = None
                ) -> torch.Tensor:
        """Embed video patches with spatial and temporal positions.

        Args:
            pixel_values (torch.Tensor): The input image patches.
                Shape: [B, T, C, H, W].
            bool_masked_pos (torch.BoolTensor, optional): Which patch
                positions are masked; masked patch embeddings are
                replaced by the learned mask token. Defaults to None.

        Returns:
            torch.Tensor: Embeddings of shape [B, T, L, C].
        """
        t = pixel_values.shape[1]
        # Fold time into the batch so the 2D patch embedder can be used.
        pixel_values = einops.rearrange(pixel_values,
                                        'b t c h w -> (b t) c h w')

        embeddings = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()  # [(b t) l c]

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # Replace the masked visual tokens by mask_tokens.
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        if self.prompt_tokens is not None:
            # Append the learnable prompt tokens after the patch tokens.
            prompt_tokens = self.prompt_tokens.expand(batch_size, -1, -1)
            embeddings = torch.cat((cls_tokens, embeddings, prompt_tokens),
                                   dim=1)
        else:
            embeddings = torch.cat((cls_tokens, embeddings),
                                   dim=1)  # [B*T, L, C]
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # Unfold time again and add temporal position embeddings.
        embeddings = einops.rearrange(embeddings, '(b t) l c -> b t l c', t=t)
        if self.temporal_position_embeddings is not None:
            if t <= self.temporal_position_embeddings.shape[1]:
                # Enough learned frame positions: take a prefix.
                embeddings = embeddings + \
                    self.temporal_position_embeddings[:, :t]
            else:
                # More frames than trained with: linearly interpolate
                # the learned temporal embeddings to length t.
                tpe = interpolate_temporal_pos_embed(
                    self.temporal_position_embeddings, t)
                embeddings = embeddings + tpe

        embeddings = self.dropout(embeddings)

        return embeddings
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional['BeitRelativePositionBias'] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """Run one Beit layer over [b, t, l, c] hidden states, applying
        the configured temporal block before or after the 2D layer.

        Args:
            hidden_states (torch.Tensor): Input of shape [b, t, l, c].
            head_mask (torch.Tensor, optional): Attention head mask.
            output_attentions (bool): Whether to also return attention
                weights. Defaults to False.
            relative_position_bias (optional): Relative position bias
                added inside the attention module.

        Returns:
            tuple: ``(layer_output, ...)`` with the layer output first,
            followed by the attention outputs when requested.
        """
        b, t, l, c = hidden_states.shape

        if self.temporal_model_block == 'xclip':
            assert (self.temporal_model_position == 'first'
                    and self.config.add_k_prompts
                    == 1), ('xclip must be put before the attention and'
                            'add_k_prompts must be 1.')

        # Temporal modeling before the spatial attention, if configured.
        if self.temp_model is not None and \
                self.temporal_model_position == 'first':
            hidden_states = self.temp_model(hidden_states)

        # Fold time into the batch for the 2D attention.
        hidden_states = einops.rearrange(hidden_states, 'b t l c -> (b t) l c')

        self_attention_outputs = self.attention(
            self.layernorm_before(
                hidden_states
            ),  # in BEiT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
        )
        attention_output = self_attention_outputs[0]

        # add self attentions if we output attention weights
        outputs = self_attention_outputs[1:]

        # apply lambda_1 (layer-scale) if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in BEiT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        # Unfold time back out of the batch dimension.
        layer_output = einops.rearrange(
            layer_output, '(b t) l c -> b t l c', b=b)

        # Temporal modeling after the spatial layer, if configured.
        if self.temp_model is not None and \
                self.temporal_model_position == 'last':
            layer_output = self.temp_model(layer_output)

        outputs = (layer_output, ) + outputs

        return outputs
class BeitConfig3D(BeitConfig):
    """Beit config extended with video (temporal) modeling options.

    Args:
        num_frames (int): Number of input frames. Defaults to 1.
        temporal_model_block (str): Type of the temporal modeling block:
            'st_adapter', 'timesformer', 'ta_beit', 'window_attention',
            'xclip' or 'none'. Defaults to 'none'.
        temporal_model_position (str): Where the temporal block is
            applied, 'first' or 'last'. Defaults to 'last'.
        temporal_model_init_value (float): Init value of the learnable
            temporal scale. Defaults to 0.0.
        temporal_model_config (dict, optional): Extra config for the
            temporal block. Defaults to None (treated as an empty dict).
        use_temporal_position_embedding (bool): Whether to add learned
            temporal position embeddings. Defaults to False.
        add_k_prompts (int): Number of learnable prompt tokens appended
            to the token sequence. Defaults to 0.
        **kwargs: Forwarded to ``BeitConfig``.
    """

    def __init__(self,
                 num_frames=1,
                 temporal_model_block='none',
                 temporal_model_position='last',
                 temporal_model_init_value=0.0,
                 temporal_model_config=None,
                 use_temporal_position_embedding=False,
                 add_k_prompts=0,
                 **kwargs) -> None:

        super().__init__(**kwargs)
        self.temporal_model_block = temporal_model_block
        # BUGFIX: the original default was a mutable `{}`, shared across
        # every instance created without the argument; normalize a None
        # default to a fresh dict instead.
        self.temporal_model_config = (
            {} if temporal_model_config is None else temporal_model_config)
        self.temporal_model_position = temporal_model_position
        self.temporal_model_init_value = temporal_model_init_value
        self.use_temporal_position_embedding = use_temporal_position_embedding
        self.add_k_prompts = add_k_prompts
        self.num_frames = num_frames
--- /dev/null +++ b/mmaction/models/multimodal/vindlu/modeling_bert.py @@ -0,0 +1,1740 @@ +# flake8: noqa +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from mmengine.logging import MMLogger +from torch import Tensor, device, dtype, nn +from torch.nn import CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +# from transformers.models.bert.configuration_bert import BertConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import (ModelOutput, add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + 
transformers.logging.set_verbosity_error()

_CONFIG_FOR_DOC = 'BertConfig'
_TOKENIZER_FOR_DOC = 'BertTokenizer'

BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-base-cased',
    'bert-large-cased',
    'bert-base-multilingual-uncased',
    'bert-base-multilingual-cased',
    'bert-base-chinese',
    'bert-base-german-cased',
    'bert-large-uncased-whole-word-masking',
    'bert-large-cased-whole-word-masking',
    'bert-large-uncased-whole-word-masking-finetuned-squad',
    'bert-large-cased-whole-word-masking-finetuned-squad',
    'bert-base-cased-finetuned-mrpc',
    'bert-base-german-dbmdz-cased',
    'bert-base-german-dbmdz-uncased',
    'cl-tohoku/bert-base-japanese',
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    'cl-tohoku/bert-base-japanese-char',
    'cl-tohoku/bert-base-japanese-char-whole-word-masking',
    'TurkuNLP/bert-base-finnish-cased-v1',
    'TurkuNLP/bert-base-finnish-uncased-v1',
    'wietsedv/bert-base-dutch-cased',
    # See all BERT models at https://huggingface.co/models?filter=bert
]


class BertConfig(PretrainedConfig):
    """Configuration class for :class:`BertModel`.

    Mirrors the upstream HuggingFace ``BertConfig`` (same argument names and
    defaults: ``vocab_size=30522``, ``hidden_size=768``, 12 layers, 12 heads,
    GELU activation, 0.1 dropout, absolute position embeddings, etc. — see
    the HuggingFace documentation for their meaning) and adds two fields used
    by this file:

    - ``cross_module`` (str): cross-attention module variant. Defaults to
      ``'ca'``.
    - ``encoder_width`` (int): hidden width of the encoder whose states feed
      the cross-attention key/value projections. Defaults to 768.
    """
    model_type = 'bert'

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type='absolute',
        use_cache=True,
        classifier_dropout=None,
        cross_module='ca',
        encoder_width=768,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Store every option verbatim on the config object, as
        # PretrainedConfig serializes instance attributes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
        self.cross_module = cross_module
        self.encoder_width = encoder_width
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load TF checkpoint variables into a PyTorch BERT ``model``.

    Walks every variable in the TensorFlow checkpoint, maps the '/'-separated
    TF variable name onto the corresponding PyTorch submodule/parameter, and
    copies the array in place.

    Args:
        model: The PyTorch model to populate.
        config: Unused here; kept for interface parity with transformers.
        tf_checkpoint_path (str): Path to the TF checkpoint.

    Returns:
        The same ``model``, with weights loaded.

    Raises:
        ImportError: If TensorFlow is not installed.
    """
    logger = MMLogger.get_current_instance()
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
            'https://www.tensorflow.org/install/ for installation instructions.'
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info('Converting TensorFlow checkpoint from {}'.format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info('Loading TF weight {} with shape {}'.format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in [
                'adam_v',
                'adam_m',
                'AdamWeightDecayOptimizer',
                'AdamWeightDecayOptimizer_1',
                'global_step',
        ] for n in name):
            logger.info('Skipping {}'.format('/'.join(name)))
            continue
        # Descend from the model root following the TF name components.
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                # e.g. "layer_3" -> attribute "layer", index 3
                scope_names = re.split(r'_(\d+)', m_name)
            else:
                scope_names = [m_name]
            # TF naming conventions: kernel/gamma -> weight,
            # output_bias/beta -> bias, squad -> classifier head.
            if scope_names[0] == 'kernel' or scope_names[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif scope_names[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE(review): this `continue` only skips the current
                    # name component of the inner loop, not the whole
                    # variable — matches the upstream HuggingFace code, but
                    # confirm this is the intended behavior.
                    logger.info('Skipping {}'.format('/'.join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # TF stores dense kernels transposed relative to torch Linear.
            array = np.transpose(array)
        try:
            assert (
                pointer.shape == array.shape
            ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise

        logger.info('Initialize PyTorch weight {}'.format(name))
        pointer.data = torch.from_numpy(array)
    return model
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type
    embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        # `LayerNorm` keeps the TensorFlow-style capitalisation so TF
        # checkpoints can be loaded by variable name.
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # (1, max_position_embeddings) index buffer, serialized with the
        # model so it is contiguous in memory on export.
        self.register_buffer(
            'position_ids',
            torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config,
                                               'position_embedding_type',
                                               'absolute')

        self.config = config

    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
    ):
        # Derive (batch, seq) shape from whichever input was supplied.
        if input_ids is None:
            shape = inputs_embeds.size()[:-1]
        else:
            shape = input_ids.size()
        seq_len = shape[1]

        if position_ids is None:
            start = past_key_values_length
            position_ids = self.position_ids[:, start:start + seq_len]

        if token_type_ids is None:
            token_type_ids = torch.zeros(
                shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        out = inputs_embeds + self.token_type_embeddings(token_type_ids)
        # Positional information is only added for absolute embeddings;
        # relative variants handle positions inside the attention layers.
        if self.position_embedding_type == 'absolute':
            out = out + self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(out))
class BertSelfAttention(nn.Module):
    """Multi-head self- or cross-attention.

    When ``is_cross_attention`` is True, the key/value projections read
    ``config.encoder_width``-dimensional encoder states instead of
    ``hidden_size``-dimensional hidden states.
    """

    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
                config, 'embedding_size'):
            raise ValueError(
                'The hidden size (%d) is not a multiple of the number of attention '
                'heads (%d)' %
                (config.hidden_size, config.num_attention_heads))

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size /
                                       config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            # Keys/values come from the (possibly wider) encoder stream.
            self.key = nn.Linear(config.encoder_width, self.all_head_size)
            self.value = nn.Linear(config.encoder_width, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config,
                                               'position_embedding_type',
                                               'absolute')
        if (self.position_embedding_type == 'relative_key'
                or self.position_embedding_type == 'relative_key_query'):
            self.max_position_embeddings = config.max_position_embeddings
            # One embedding per possible signed token distance.
            self.distance_embedding = nn.Embedding(
                2 * config.max_position_embeddings - 1,
                self.attention_head_size)
        # When set True externally, cross-attention maps and their gradients
        # are cached on the module (see forward).
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        # Backward hook target: caches attention gradients.
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        # Caches the softmax-normalized attention probabilities.
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        # (b, l, all_head) -> (b, heads, l, head_size)
        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                       self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(
                self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(
                self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Decoding step: append new keys/values to the cached ones.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer,
                                        key_layer.transpose(-1, -2))

        if (self.position_embedding_type == 'relative_key'
                or self.position_embedding_type == 'relative_key_query'):
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(
                seq_length, dtype=torch.long,
                device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(
                seq_length, dtype=torch.long,
                device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            # Shift signed distances into [0, 2*max-2] for embedding lookup.
            positional_embedding = self.distance_embedding(
                distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(
                dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == 'relative_key':
                relative_position_scores = torch.einsum(
                    'bhld,lrd->bhlr', query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == 'relative_key_query':
                relative_position_scores_query = torch.einsum(
                    'bhld,lrd->bhlr', query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum(
                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
                attention_scores = (
                    attention_scores + relative_position_scores_query +
                    relative_position_scores_key)

        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        # added `attention_scores` to return tuple
        outputs = ((context_layer, attention_probs,
                    attention_scores) if output_attentions else
                   (context_layer, ))

        outputs = outputs + (past_key_value, )
        return outputs
class BertSelfOutput(nn.Module):
    """Projects the attention output and applies residual + LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        # dense -> dropout -> residual add -> LayerNorm
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertAttention(nn.Module):
    """Attention sub-block: ``BertSelfAttention`` followed by the residual
    projection in ``BertSelfOutput``."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()

        self.self = BertSelfAttention(config, is_cross_attention)

        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given heads and shrink the projection layers."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads,
            self.self.num_attention_heads,
            self.self.attention_head_size,
            self.pruned_heads,
        )

        # Shrink q/k/v and the output projection to the surviving indices.
        for proj in ('query', 'key', 'value'):
            setattr(self.self, proj,
                    prune_linear_layer(getattr(self.self, proj), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Book-keeping: fewer heads, smaller total head size.
        self.self.num_attention_heads -= len(heads)
        self.self.all_head_size = (self.self.attention_head_size *
                                   self.self.num_attention_heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attn_output = self.output(attn_results[0], hidden_states)
        # (attention_output, [attention_probs, attention_scores,] past_key_value)
        return (attn_output, ) + attn_results[1:]


class BertIntermediate(nn.Module):
    """Feed-forward expansion: hidden_size -> intermediate_size + activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` may be a name looked up in ACT2FN or a callable.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        return self.intermediate_act_fn(self.dense(hidden_states))
class BertOutput(nn.Module):
    """Feed-forward contraction: intermediate_size -> hidden_size with
    dropout, residual add and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention (for
    layers at or above ``config.fusion_layer``), and a feed-forward block."""

    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config)

        # Fix: `forward` reads `self.layer_num` when `encoder_hidden_states`
        # is a list, but it was never stored — that branch previously raised
        # AttributeError.
        self.layer_num = layer_num
        self.has_cross_attention = layer_num >= config.fusion_layer
        if self.has_cross_attention:
            self.crossattention = BertAttention(
                config, is_cross_attention=True)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # decoder uni-directional self-attention cached key/values tuple is
        # at positions 1,2
        self_attn_past_key_value = (
            past_key_value[:2] if past_key_value is not None else None)
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )  # (context_layer, attention_probs, attention_scores, past_key_value,)
        attention_output = self_attention_outputs[0]

        outputs = self_attention_outputs[1:-1]
        present_key_value = self_attention_outputs[-1]

        if self.has_cross_attention:
            assert (
                encoder_hidden_states is not None
            ), 'encoder_hidden_states must be given for cross-attention layers'

            if type(encoder_hidden_states) == list:
                # Several encoder streams: pick one per fusion layer,
                # round-robin over the list.
                idx = ((self.layer_num - self.config.fusion_layer) %
                       len(encoder_hidden_states))
                cross_attention_outputs = self.crossattention(
                    attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states[idx],
                    encoder_attention_mask[idx],
                    output_attentions=output_attentions,
                )
                attention_output = cross_attention_outputs[0]
                outputs = outputs + cross_attention_outputs[1:-1]

            else:
                cross_attention_outputs = self.crossattention(
                    attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )  # (context_layer, attention_probs, attention_scores, past_key_value,)
                attention_output = cross_attention_outputs[0]
                # add cross attentions if we output attention weights
                outputs = outputs + cross_attention_outputs[1:-1]
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output,
        )
        outputs = (layer_output, ) + outputs

        outputs = outputs + (present_key_value, )

        return outputs

    def feed_forward_chunk(self, attention_output):
        """Feed-forward applied to one chunk along the sequence dimension."""
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
class BertEncoder(nn.Module):
    """Stack of ``BertLayer`` blocks with mode-gated partial execution.

    ``mode`` selects which slice of the stack runs:
    'text'/'temporal' -> layers [0, fusion_layer);
    'fusion' -> layers [fusion_layer, num_hidden_layers);
    'multi_modal' -> all layers.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)])
        logger = MMLogger.get_current_instance()
        logger.info(f'build bert with cross_module: {config.cross_module}')

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode='multi_modal',
        normalize_attention=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        # all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_cross_attentions = () if output_attentions else None

        next_decoder_cache = () if use_cache else None

        if (mode == 'text' or mode == 'temporal'
            ):  # temporal is added and used for temporal att module.
            start_layer = 0
            output_layer = self.config.fusion_layer

        elif mode == 'fusion':
            start_layer = self.config.fusion_layer
            output_layer = self.config.num_hidden_layers

        elif mode == 'multi_modal':
            start_layer = 0
            output_layer = self.config.num_hidden_layers

        # NOTE(review): any other `mode` value leaves start_layer/output_layer
        # undefined and raises NameError below — confirm callers only pass
        # the three modes above.

        for i in range(start_layer, output_layer):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states, )

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[
                i] if past_key_values is not None else None

            if getattr(self.config, 'gradient_checkpointing',
                       False) and self.training:

                if use_cache:
                    # Caching is incompatible with recomputation; disable it.
                    logger = MMLogger.get_current_instance()
                    logger.warn(
                        '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting '
                        '`use_cache=False`...')
                    use_cache = False

                def create_custom_forward(module):
                    # Binds past_key_value/output_attentions so the
                    # checkpoint API only sees tensor inputs.

                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value,
                                      output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    use_reentrant=False,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )  # (context_layer, attention_probs, attention_scores, past_key_value,)
            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1], )
            if output_attentions:
                # whether to output normalized attention,
                # note for unnormalized attention, there is a mask added
                # offset 1 -> softmaxed probs (index 1); offset 0 -> raw
                # pre-softmax scores (index 2), which include the mask.
                offset = int(normalize_attention)
                # all_self_attentions = all_self_attentions + (layer_outputs[1], )
                all_self_attentions = all_self_attentions + (
                    layer_outputs[2 - offset], )
                if hasattr(layer_module, 'crossattention'):
                    # all_cross_attentions = all_cross_attentions + (layer_outputs[3], )
                    all_cross_attentions = all_cross_attentions + (
                        layer_outputs[4 - offset], )

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states, )

        if not return_dict:
            return tuple(v for v in [
                hidden_states,
                next_decoder_cache,
                all_hidden_states,
                all_self_attentions,
                all_cross_attentions,
            ] if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPooler(nn.Module):
    """Pools the sequence by transforming the first ([CLS]) token."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "Pooling" here is simply dense + tanh on the first token's state.
        cls_state = hidden_states[:, 0]
        return self.activation(self.dense(cls_state))
class BertPredictionHeadTransform(nn.Module):
    """dense -> activation -> LayerNorm transform before the MLM decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # `hidden_act` may be a name looked up in ACT2FN or a callable.
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        return self.LayerNorm(self.transform_act_fn(self.dense(hidden_states)))


class BertLMPredictionHead(nn.Module):
    """Masked-LM head: transform followed by a vocab-size decoder.

    The output weights are the same as the input embeddings (tied elsewhere);
    an output-only bias per token is kept as a separate parameter.
    """

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        self.decoder = nn.Linear(
            config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Link the two so the bias is resized with `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        return self.decoder(self.transform(hidden_states))
class BertOnlyMLMHead(nn.Module):
    """Wraps the MLM prediction head for models with only the MLM objective."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        return self.predictions(sequence_output)


class BertOnlyNSPHead(nn.Module):
    """Binary next-sentence-prediction head over the pooled output."""

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        return self.seq_relationship(pooled_output)


class BertPreTrainingHeads(nn.Module):
    """Joint MLM + NSP heads used during BERT pre-training."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        return (self.predictions(sequence_output),
                self.seq_relationship(pooled_output))


class BertPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and a simple
    interface for downloading and loading pretrained models."""

    config_class = BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = 'bert'
    _keys_to_ignore_on_load_missing = [r'position_ids']

    def _init_weights(self, module):
        """Initialize module weights following the BERT recipe."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version, which uses
            # truncated_normal; cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
@dataclass
class BertForPreTrainingOutput(ModelOutput):
    """Output type of :class:`~transformers.BertForPreTraining`.

    Args:
        loss (``torch.FloatTensor`` of shape ``(1,)``, `optional`):
            Sum of the masked-LM loss and the next-sequence-prediction
            (classification) loss; only returned when ``labels`` is given.
        prediction_logits (``torch.FloatTensor`` of shape
            ``(batch_size, sequence_length, config.vocab_size)``):
            Language-modeling head scores per vocabulary token
            (before SoftMax).
        seq_relationship_logits (``torch.FloatTensor`` of shape
            ``(batch_size, 2)``):
            Next-sequence-prediction head scores for True/False continuation
            (before SoftMax).
        hidden_states (``tuple(torch.FloatTensor)``, `optional`, returned
            when ``output_hidden_states=True``):
            One tensor per layer (plus the embedding output), each of shape
            ``(batch_size, sequence_length, hidden_size)``.
        attentions (``tuple(torch.FloatTensor)``, `optional`, returned when
            ``output_attentions=True``):
            One tensor per layer of shape ``(batch_size, num_heads,
            sequence_length, sequence_length)`` — post-softmax attention
            weights.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, + 1]``: + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+""" + + +@add_start_docstrings( + 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """The model can behave as an encoder (with only self-attention) as well as + a decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in `Attention + is all you need `__ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. + + Gomez, Lukasz Kaiser and Illia Polosukhin. argument and + :obj:`add_cross_attention` set to :obj:`True`; an + :obj:`encoder_hidden_states` is then expected as an input to the forward + pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. + + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask(self, attention_mask: Tensor, + input_shape: Tuple[int], device: device, + is_decoder: bool) -> Tensor: + """Makes broadcastable attention and causal masks so that future and + masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. 
+ + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= + seq_ids[None, :, None]) + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[ + 1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = ( + causal_mask[:, None, :, :] * + attention_mask[:, None, None, :]) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + 'Wrong shape for input_ids (shape {}) or attention_mask (shape {})' + .format(input_shape, attention_mask.shape)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. 
+ # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + normalize_attention=True, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions if output_attentions is not None else + self.config.output_attentions) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds or encoder_embeds' + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] + if past_key_values is not None else 0) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, 
device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) + for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds 
+ + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + normalize_attention=normalize_attention, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. 
""", + BERT_START_DOCSTRING, +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=True, + reduction='mean', + mode='multi_modal', + normalize_attention=True, + soft_labels=None, + alpha=0, + return_logits=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + normalize_attention=normalize_attention, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels, + dim=-1) + loss_distill = (loss_distill * (labels != -100)).sum(1) + lm_loss = (1 - alpha) * lm_loss + alpha * 
loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': + input_ids, + 'attention_mask': + attention_mask, + 'past_key_values': + past, + 'encoder_hidden_states': + model_kwargs.get('encoder_hidden_states', None), + 'encoder_attention_mask': + model_kwargs.get('encoder_attention_mask', None), + 'is_decoder': + True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@dataclass +class MaskedLMOutputWithDistill(MaskedLMOutput): + loss_aux: Optional[torch.FloatTensor] = None + loss_distill: Optional[torch.FloatTensor] = None + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. 
""", + BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def tie_aux_decoder_weights(self, module, aux_modules): + """Tie decoder weights of all `aux_modules` to `module`, (not bias)""" + for m in aux_modules: + m.predictions.decoder.weight = module.predictions.decoder.weight + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + normalize_attention=True, + soft_labels=None, + alpha=0, + return_logits=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_embeds=encoder_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + normalize_attention=normalize_attention, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + masked_lm_loss_aux = 0.0 + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(prediction_scores, dim=1) * soft_labels, dim=-1) + loss_distill = loss_distill[labels != -100].mean() + masked_lm_loss = (1 - + alpha) * masked_lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + # changed from MaskedLMOutput to MaskedLMOutputWithDistill + return MaskedLMOutputWithDistill( + loss=masked_lm_loss, + loss_aux=masked_lm_loss_aux, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = 
input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert (self.config.pad_token_id + is not None), 'The PAD token should be defined for generation' + attention_mask = torch.cat([ + attention_mask, + attention_mask.new_zeros((attention_mask.shape[0], 1)) + ], + dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device, + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/mmaction/models/multimodal/vindlu/temporal_model.py b/mmaction/models/multimodal/vindlu/temporal_model.py new file mode 100644 index 0000000000000000000000000000000000000000..579d62c524b6ccbf325177b4ed94183275d5ad97 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/temporal_model.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import einops +import torch +from einops import rearrange +from timm.models.layers import DropPath +from torch import nn +from torch.nn import LayerNorm, Linear, MultiheadAttention + + +class STAdapter(nn.Module): + """ST Adapter.""" + + def __init__( + self, + kernel_size=(3, 3, 3), + input_dim=768, + hidden_dim=384, + img_size=224, + patch_size=16, + drop_prob=0.1, + ): + super(STAdapter, self).__init__() + self.kernel_size = kernel_size + self.input_dim = input_dim + self.hidden_dim = hidden_dim + + self.h = self.w = img_size // patch_size + + self.linear1 = nn.Linear(input_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, input_dim) + self.act = nn.ReLU() + self.conv = nn.Conv3d( + hidden_dim, + hidden_dim, + kernel_size=kernel_size, + padding='same', + groups=hidden_dim) + self.droppath = DropPath(drop_prob=drop_prob) + + self.scale = nn.parameter.Parameter(torch.zeros([])) + + def forward(self, x: torch.Tensor): + """forward. + + Args: + x (torch.Tensor): input features. + Shape: [bs, nframes, l, c]. 
                l = 1 + h*w

        Returns: features after adapter. The same shape as input.
        """
        if x.shape[1] == 1:  # for single frame, return itself.
            return x

        shortcut = x
        x = self.linear1(x)
        # Split off the CLS token; only patch tokens go through the conv.
        cls = x[:, :, :1, :]
        tokens = x[:, :, 1:, :]
        tokens = einops.rearrange(
            tokens, 'b t (h w) c -> b c t h w', h=self.h).contiguous()
        tokens = self.conv(tokens)
        tokens = einops.rearrange(tokens, 'b c t h w -> b t (h w) c')
        x = torch.cat([cls, tokens], dim=2)  # [b, t, 1+h*w, c]
        x = self.act(x)
        x = self.linear2(x)

        return shortcut + self.scale * self.droppath(x)


class TemporalAttention(nn.Module):
    """perform temporal self-attention."""

    def __init__(self, input_dim=768, droppath_rate=0.1):
        """
        Kwargs:
            input_dim (int): The input feature dimension.
            droppath_rate (float): DropPath (stochastic depth) probability.
        """
        super().__init__()

        self._input_dim = input_dim
        # Head dim fixed at 64, so num_heads scales with input_dim.
        self.temporal_attn = MultiheadAttention(
            input_dim, num_heads=input_dim // 64)
        self.norm = LayerNorm(input_dim, eps=1e-12)
        self.linear = Linear(input_dim, input_dim)
        self.droppath = DropPath(droppath_rate)
        # Zero-init scale: the module starts as an identity mapping.
        self.scale = nn.parameter.Parameter(torch.zeros([]))

    def forward(self, x: torch.Tensor):
        """forward.

        Args:
            x (torch.Tensor): input features.
                Shape: [bs, nframes, l, c]. l = 1 + h*w

        Returns: features after adapter. The same shape as input.
        """
        if x.shape[1] == 1:  # for single frame, return itself.
            return x

        shortcut = x
        # Attend over time: each spatial location attends across frames.
        x = einops.rearrange(x, 'b t l c -> t (b l) c')
        x = self.norm(x)
        x = self.temporal_attn(x, x, x)[0]
        x = einops.rearrange(x, 't (b l) c -> b t l c', b=shortcut.shape[0])
        return shortcut + self.scale * self.droppath(x)


class WindowTemporalAttention(nn.Module):
    """perform windowed temporal self-attention."""

    def __init__(self, input_dim=768, droppath_rate=0.1, window_size=(2, 2)):
        """
        Kwargs:
            input_dim (int): The input feature dimension.
            droppath_rate (float): DropPath (stochastic depth) probability.
            window_size (tuple): spatial window (wh, ww) whose tokens attend
                jointly across frames.
        """
        super().__init__()

        self._input_dim = input_dim
        self.temporal_attn = MultiheadAttention(
            input_dim, num_heads=input_dim // 64)
        self.norm = LayerNorm(input_dim, eps=1e-12)
        self.droppath = DropPath(droppath_rate)
        self.scale = nn.parameter.Parameter(torch.zeros([]))
        self.wh, self.ww = window_size

    def forward(self, x: torch.Tensor):
        """forward.

        Args:
            x (torch.Tensor): input features.
                Shape: [bs, nframes, l, c]. l = 1 + h*w

        Returns: features after adapter. The same shape as input.
        """
        if x.shape[1] == 1:  # for single frame, return itself.
            return x
        shortcut = x

        # Recover the (square) token grid size; the first token is CLS.
        h = w = int(math.sqrt(x.shape[2] - 1))
        cls_token = x[:, :, :1, :]
        # Group tokens into (wh x ww) spatial windows; attention runs over
        # all frames x window positions within each window.
        x = einops.rearrange(
            x[:, :, 1:, :],
            'b t (nh wh nw ww) c -> (t wh ww) (b nh nw) c',
            nh=h // self.wh,
            wh=self.wh,
            nw=w // self.ww,
            ww=self.ww,
        )
        x = self.norm(x)
        x = self.temporal_attn(x, x, x)[0]
        x = einops.rearrange(
            x,
            '(t wh ww) (b nh nw) c -> b t (nh wh nw ww) c',
            wh=self.wh,
            ww=self.ww,
            nh=h // self.wh,
            nw=w // self.ww,
        )
        # add back cls token.
        x = torch.concat([cls_token, x], dim=2)
        return shortcut + self.scale * self.droppath(x)


class X_CLIP(nn.Module):
    """Cross-frame message-token attention in the style of X-CLIP: a message
    token is derived from each frame's CLS token, attends across frames, and
    replaces the last prompt token."""

    def __init__(self, input_dim=768, droppath_rate=0.1, num_prompts=1):
        """
        Kwargs:
            input_dim (int): The input feature dimension.
            droppath_rate (float): DropPath (stochastic depth) probability.
            num_prompts (int): number of prompt tokens appended to the
                sequence.
        """
        super().__init__()

        d_model = input_dim

        self.message_fc = nn.Linear(d_model, d_model)
        self.message_ln = LayerNorm(d_model, eps=1e-12)
        self.message_attn = nn.MultiheadAttention(d_model, d_model // 64)
        self.num_prompts = num_prompts

        self.droppath = DropPath(droppath_rate)

    def forward(self, x: torch.Tensor):
        """forward.

        Args:
            x (torch.Tensor): input features.
                Shape: [bs, nframes, l, c]. l = 1 + h*w

        Returns: features after adapter. The same shape as input.
        """
        if x.shape[1] == 1:  # for single frame, return itself.
+ return x + msg_token = self.message_ln(self.message_fc(x[:, :, + 0, :])) # [b, t, c] + msg_token = rearrange(msg_token, 'b t c -> t b c') + msg_token = msg_token + self.droppath( + self.message_attn(msg_token, msg_token, msg_token)[0]) + msg_token = rearrange(msg_token, 't b c -> b t c') + # replace the last prompt token with msg_token. + x = torch.cat([x[:, :, :-1, :], + msg_token.unsqueeze(2)], dim=2) # [b, t, l+1, c] + return x diff --git a/mmaction/models/multimodal/vindlu/tokenizer.py b/mmaction/models/multimodal/vindlu/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8897c4e79c71d1e2da596030b3c638e85068edb --- /dev/null +++ b/mmaction/models/multimodal/vindlu/tokenizer.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from transformers import BertTokenizer + +from mmaction.registry import TOKENIZER + + +class VindLUTokenizer(BertTokenizer): + """VindLUTokenizer inherit BertTokenizer. + + The main difference from BertTokenizer is removing the last separate token + for a single sequence. + """ + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """Build model inputs from a sequence or a pair of sequence for + sequence classification tasks by concatenating and adding special + tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with + the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + +TOKENIZER.register_module( + 'VindLUTokenizer', module=VindLUTokenizer.from_pretrained) diff --git a/mmaction/models/multimodal/vindlu/utils.py b/mmaction/models/multimodal/vindlu/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9116b7d9a273f3992b3e0edc418876648795d24c --- /dev/null +++ b/mmaction/models/multimodal/vindlu/utils.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.dist as dist +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.logging import MMLogger +from scipy import interpolate + + +def all_gather_concat(data: torch.Tensor) -> torch.Tensor: + """Gather tensors with different first-dimension size and concat to one + tenosr. + + Note: + Only the first dimension should be different. + + Args: + data (Tensor): Tensor to be gathered. + + Returns: + torch.Tensor: The concatenated tenosr. + """ + if dist.get_world_size() == 1: + return data + + data_size = torch.tensor(data.size(0), device=data.device) + sizes_list = dist.all_gather(data_size) + + total_length = sum(sizes_list) + max_length = max(sizes_list) + size_diff = max_length.item() - data_size.item() + if size_diff: + padding = torch.zeros( + size_diff, *data.size()[1:], device=data.device, dtype=data.dtype) + data = torch.cat((data, padding)) + + gather_list = dist.all_gather(data) + + # gather all data according to the default DDP sampler. For instance, + # 8 samples on 2 GPUs, GPU0: [0,2,4,6], GPU1: [1,3,5,7], will be gathered + # as [0,1,2,3,4,5,6,7] + all_data = [] + for gather_batch in zip(*gather_list): + all_data.extend(gather_batch) + + return torch.stack(all_data)[:total_length] + + +def interpolate_pos_embed_beit(state_dict, new_model): + """interpolate the positional embeddings. 
The spatial pe is relative and + temporal pe is absolute. additional temporal pe is padded with 0. + + Args: + state_dict (dict): The state_dict. + new_model (nn.Module): The created model. + + Returns: dict. The state_dict with updated positional embeddings. + """ + state_dict = interpolate_pos_relative_bias_beit( + state_dict_old=state_dict, + state_dict_new=new_model.state_dict(), + patch_shape_new=new_model.vision_encoder.embeddings.patch_embeddings. + patch_shape, + ) + # absolute temporal pos bias + temporal_pe_key = 'vision_encoder.embeddings.temporal_position_embeddings' + if temporal_pe_key in state_dict: + logger = MMLogger.get_current_instance() + logger.info( + f'interpolate temporal positional embeddings: {temporal_pe_key}') + state_dict[temporal_pe_key] = load_temp_embed_with_mismatch( + temp_embed_old=state_dict[temporal_pe_key], + temp_embed_new=new_model.state_dict()[temporal_pe_key], + ) + return state_dict + + +def load_temp_embed_with_mismatch(temp_embed_old, + temp_embed_new, + add_zero=True): + """Add/Remove extra temporal_embeddings as needed. + https://arxiv.org/abs/2104.00650 shows adding zero paddings works. + + temp_embed_old: (1, num_frames_old, 1, d) + temp_embed_new: (1, num_frames_new, 1, d) + add_zero: bool, if True, add zero, else, interpolate trained embeddings. + """ + # TODO zero pad + num_frms_new = temp_embed_new.shape[1] + num_frms_old = temp_embed_old.shape[1] + logger = MMLogger.get_current_instance() + logger.info( + f'Load temporal_embeddings, lengths: {num_frms_old}-->{num_frms_new}') + if num_frms_new > num_frms_old: + if add_zero: + temp_embed_new[:, :num_frms_old] \ + = temp_embed_old # untrained embeddings are zeros. 
+ else: + temp_embed_new = interpolate_temporal_pos_embed( + temp_embed_old, num_frms_new) + elif num_frms_new < num_frms_old: + temp_embed_new = temp_embed_old[:, :num_frms_new] + else: # = + temp_embed_new = temp_embed_old + return temp_embed_new + + +def interpolate_temporal_pos_embed(temp_embed_old, num_frames_new): + """ + temp_embed_old: (1, num_frames_old, 1, d) + Returns: + temp_embed_new: (1, num_frames_new, 1, d) + """ + temp_embed_old = temp_embed_old.squeeze(2).permute( + 0, 2, 1) # (1, d, num_frames_old) + temp_embed_new = F.interpolate( + temp_embed_old, num_frames_new, + mode='linear') # (1, d, num_frames_new) + temp_embed_new = temp_embed_new.permute(0, 2, 1).unsqueeze( + 2) # (1, num_frames_new, 1, d) + return temp_embed_new + + +def interpolate_pos_relative_bias_beit(state_dict_old, state_dict_new, + patch_shape_new): + """ + Args: + state_dict_old: loaded state dict + state_dict_new: state dict for model with new image size + patch_shape_new: new model patch_shape + ref: https://github.com/microsoft/unilm/blob/master/beit/run_class_finetuning.py # noqa: E501 + """ + all_keys = list(state_dict_old.keys()) + for key in all_keys: + if 'relative_position_index' in key: + state_dict_old.pop(key) + + if 'relative_position_bias_table' in key: + rel_pos_bias = state_dict_old[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + dst_num_pos, _ = state_dict_new[key].size() + dst_patch_shape = patch_shape_new + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 
1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, + src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( + rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), + dim=0) + state_dict_old[key] = new_rel_pos_bias + return state_dict_old diff --git a/mmaction/models/multimodal/vindlu/vindlu.py b/mmaction/models/multimodal/vindlu/vindlu.py new file mode 100644 index 0000000000000000000000000000000000000000..cc7eaf88261765ad159eb8d8cfaec649c7398321 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Optional + +import torch +from mmengine.logging import MMLogger +from mmengine.model import BaseModel +from mmengine.runner.checkpoint import _load_checkpoint +from torch import nn + +from mmaction.registry import MODELS, TOKENIZER +from mmaction.utils import ForwardResults, SampleList +from .utils import (interpolate_pos_embed_beit, + interpolate_pos_relative_bias_beit) + + +class VindLUBase(BaseModel): + """VindLU base Model. + + Args: + tokenizer: (dict): The config for tokenizer. + vision_encoder (dict): Backbone for extracting image features. + text_encoder (dict): Backbone for extracting text features. 
+ temperature (float): Temperature parameter that controls the + concentration level of the distribution. Defaults to 0.07. + gradient_checkpointing (bool): Whether to do gradient_checkpointing. + Using checkpoint will save some memory while slowing down the + training speed. Defaults to False. + data_preprocessor (Optional[dict]): The config for preprocessing input + data. + init_cfg (Optional[dict]): the config to control the initialization. + Defaults to None. + """ + + def __init__( + self, + tokenizer: dict, + vision_encoder: dict, + text_encoder: dict, + proj_dim: int = 256, + temperature: float = 0.07, + gradient_checkpointing: bool = False, + pretrined_vl: bool = True, + data_preprocessor: Optional[dict] = None, + init_cfg: Optional[dict] = None, + ): + if data_preprocessor is None: + data_preprocessor = dict(type='ActionDataPreprocessor') + super().__init__( + init_cfg=init_cfg, data_preprocessor=data_preprocessor) + + self.tokenizer = TOKENIZER.build(tokenizer) + self.vision_cfg = vision_encoder + self.text_encoder_cfg = text_encoder + self.gradient_checkpointing = gradient_checkpointing + self.text_encoder_cfg.gradient_checkpointing = gradient_checkpointing + + self.vision_width = vision_encoder.pop('encoder_width') + self.text_width = text_encoder.encoder_width + self.pretrined_vl = pretrined_vl + + if self.vision_cfg.pop('add_ln'): + self.vision_layernorm = nn.LayerNorm(self.vision_width, eps=1e-12) + else: + self.vision_layernorm = nn.Identity() + + self.vision_encoder = MODELS.build(self.vision_cfg) + + if gradient_checkpointing: + self.vision_encoder.gradient_checkpointing_enable() + + self.text_encoder = MODELS.build(self.text_encoder_cfg) + + self.vision_proj = nn.Linear(self.vision_width, proj_dim) + self.text_proj = nn.Linear(self.text_width, proj_dim) + + self.temp = nn.parameter.Parameter(torch.ones([]) * temperature) + self.itm_head = nn.Linear(self.text_width, 2) + + def extract_feat(self, inputs: torch.Tensor, **kwargs) -> ForwardResults: + 
"""Extract features from raw inputs.""" + + @abstractmethod + def loss(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + def forward(self, inputs, data_samples, mode: str = 'loss'): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[``ActionDataSample], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + + if mode == 'tensor': + return self.extract_feat(inputs, data_samples) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}".') + + def encode_vision(self, image): + """encode image / videos as features. + + Args: + image (torch.Tensor): The input images. + + Returns: tuple. + - vision_embeds (torch.Tensor): The features of all patches. + Shape: [B,T,L,C]. + - pooled_vision_embeds (torch.Tensor): The pooled features. + Shape: [B,T,C]. 
+ """ + output_dict = self.vision_encoder(image) + vision_embeds = self.vision_layernorm(output_dict.last_hidden_state) + pooled_vision_embeds = output_dict.pooler_output + + return vision_embeds, pooled_vision_embeds + + def encode_text(self, text): + """encode text. + Args: + text (dict): The output of huggingface's `PreTrainedTokenizer`. + contains keys: + - input_ids (torch.Tensor): Token ids to be fed to a model. + Shape: [B,L]. + - attention_mask (torch.Tensor): The mask indicate padded tokens. + Shape: [B,L]. 0 is padded token. + - other keys refer to "https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__". # noqa: E501 + Returns: tuple. + - text_embeds (torch.Tensor): The features of all tokens. Shape: [B,L,C]. + - pooled_text_embeds (torch.Tensor): The pooled features. Shape: [B,C]. + + """ + text_output = self.text_encoder( + text.input_ids, + attention_mask=text.attention_mask, + return_dict=True, + mode='text', + ) + text_embeds = text_output.last_hidden_state + pooled_text_embeds = text_embeds[:, 0] + return text_embeds, pooled_text_embeds + + @torch.no_grad() + def clip_contrastive_temperature(self, min_val=0.001, max_val=0.5): + """Seems only used during pre-training.""" + self.temp.clamp_(min_val, max_val) + + @property + def device(self): + return next(self.parameters()).device + + def preprocess_state_dict(self, state_dict): + """Preprocess pretrained checkpoint for text_encoder.""" + for key in list(state_dict.keys()): + if 'bert' in key: + encoder_key = key.replace('bert.', '') + state_dict[encoder_key] = state_dict[key] + del state_dict[key] + return state_dict + + def load_from_pretrainded_beit(self): + from transformers.models.beit.modeling_beit import BeitModel + beit2d = BeitModel.from_pretrained( + self.vision_cfg.pretrained_model_name_or_path) + ori_state_dict = beit2d.state_dict() + del beit2d + # interpolate relative pos bias + state_dict = 
interpolate_pos_relative_bias_beit( + state_dict_old=ori_state_dict, + state_dict_new=self.vision_encoder.state_dict(), + patch_shape_new=self.vision_encoder.embeddings.patch_embeddings. + patch_shape, + ) + + for k in list(state_dict.keys()): + if 'prompt_bias_table' in k: + del state_dict[k] + + msg = self.vision_encoder.load_state_dict(state_dict, strict=False) + logger = MMLogger.get_current_instance() + logger.info(msg) + + def init_weights(self): + if self.vision_cfg.get('pretrained2d', False): + self.load_from_pretrainded_beit() + + if self.pretrined_vl: + assert self.init_cfg.get('type') == 'Pretrained', ( + 'Please specify ' + 'init_cfg to use pretrained video-language checkpoint') + self.pretrained = self.init_cfg.get('checkpoint') + checkpoint = _load_checkpoint(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + state_dict = interpolate_pos_embed_beit(state_dict, self) + state_dict = self.preprocess_state_dict(state_dict) + msg = self.load_state_dict(state_dict, strict=False) + logger = MMLogger.get_current_instance() + logger.info(msg) + else: + super().init_weights() diff --git a/mmaction/models/multimodal/vindlu/vindlu_ret.py b/mmaction/models/multimodal/vindlu/vindlu_ret.py new file mode 100644 index 0000000000000000000000000000000000000000..da65951e423019dff14e3e50c38e718a542c69c6 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/vindlu_ret.py @@ -0,0 +1,464 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +import mmengine.dist as dist +import torch +import torch.nn.functional as F +from einops import rearrange +from torch.distributed.nn import all_gather as all_gather_with_grad + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample +from mmaction.utils import track_on_main_process +from .utils import all_gather_concat +from .vindlu import VindLUBase + + +@MODELS.register_module() +class VindLURetrieval(VindLUBase): + """VindLU retriever. 
+ + max_txt_len (int): Max text length of input text, used for retrieval + from multiple choices. Defaults to 32. + topk (int): Select topk similarity as candidates for compute matching + scores. Defaults to 256. + negative_all_rank (bool): Whether to sample negative data from all + ranks for image text matching in training. Defaults to False. + fast_match (bool): If False, select topk similarity as candidates and + compute the matching score. If True, return the similarity as the + matching score directly. Defaults to False. + **kwargs: Other keyword arguments to initialize the VindLU base model. + """ + + def __init__(self, + max_txt_len: int = 32, + topk: int = 128, + negative_all_rank: bool = False, + fast_match: bool = False, + **kwargs): + super().__init__(**kwargs) + + self.max_txt_len = max_txt_len + self.topk = topk + self.negative_all_rank = negative_all_rank + self.fast_match = fast_match + + def loss( + self, + inputs: torch.Tensor, + data_samples: Optional[List[ActionDataSample]] = None, + ) -> Dict[str, torch.tensor]: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (dict): A batch of inputs. The input tensor with of + at least one modality. For image, the value is a tensor + of shape (N, C, ...) in general. + For text, the value is a dict of tokenized text inputs. + data_samples (Optional[List[DataSample]]): + The annotation data of every samples. Defaults to None. 
+ + Returns: + Dict[str, torch.tensor]: a dictionary of loss components of + """ + output = self.extract_feat(inputs, data_samples) + + text_embeds = output['text_embeds'] + text_attn_mask = output['text_attn_mask'] + image_embeds = output['image_embeds'] + image_feat = output['image_feat'] + text_feat = output['text_feat'] + + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(self.device) + + # ITC Loss + # B*world_size, D + image_feat_all = torch.cat(dist.all_gather(image_feat)) + # B*world_size, D + text_feat_all = torch.cat(dist.all_gather(text_feat)) + + # image to text similarity + # B, B*world_size + sim_i2t = torch.einsum('mld,nd->mln', image_feat, + text_feat_all).mean(1) / self.temp + # text-image similarity + # B, B*world_size + sim_t2i = torch.einsum('md,nld->mln', text_feat, + image_feat_all).mean(1) / self.temp + + rank = dist.get_rank() + bs = inputs.size(0) + itc_targets = torch.linspace( + rank * bs, rank * bs + bs - 1, bs, dtype=int).to(self.device) + + itc_loss = (F.cross_entropy(sim_i2t, itc_targets) + + F.cross_entropy(sim_t2i, itc_targets)) / 2 + + # prepare for itm + output_pos = self.text_encoder( + encoder_embeds=text_embeds, + attention_mask=text_attn_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + mode='fusion', + ) + + idx = torch.tensor([i.gt_video_id for i in data_samples]).view(-1, 1) + bs = idx.size(0) + if self.negative_all_rank: + idxs = torch.cat(dist.all_gather(idx)) + image_feat_world = torch.cat(dist.all_gather(image_feat)) + text_feat_world = torch.cat(dist.all_gather(text_feat)) + att_mask_world = torch.cat(dist.all_gather(text_attn_mask)) + text_embeds_world = torch.cat(all_gather_with_grad(text_embeds)) + image_embeds_world = torch.cat(all_gather_with_grad(image_embeds)) + else: + idxs = idx + image_feat_world = image_feat.detach() + text_feat_world = text_feat.detach() + image_embeds_world = image_embeds + text_embeds_world = text_embeds + 
att_mask_world = text_attn_mask + + with torch.no_grad(): + # compute sample similarity + sim_i2t = torch.einsum('mld,nd->mln', image_feat, + text_feat_world).mean(1) / self.temp + sim_t2i = torch.einsum('md,nld->mln', text_feat, + image_feat_world).mean(1) / self.temp + + mask = torch.eq(idx, idxs.t()).to(self.device) + weights_i2t = F.softmax(sim_i2t + 1e-4, dim=1) + weights_i2t.masked_fill_(mask, 0) + + weights_t2i = F.softmax(sim_t2i + 1e-4, dim=1) + weights_t2i.masked_fill_(mask, 0) + + # select a negative image for each text + neg_idx = torch.multinomial(weights_t2i, 1).squeeze() + image_embeds_neg = image_embeds_world[neg_idx] + + # select a negative text for each image + neg_idx = torch.multinomial(weights_i2t, 1).squeeze() + text_embeds_neg = text_embeds_world[neg_idx] + text_atts_neg = att_mask_world[neg_idx] + + text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0) + text_atts_all = torch.cat([text_attn_mask, text_atts_neg], dim=0) + + image_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0) + image_atts_all = torch.cat([image_atts, image_atts], dim=0) + + output_neg = self.text_encoder( + encoder_embeds=text_embeds_all, + attention_mask=text_atts_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + return_dict=True, + mode='fusion', + ) + + vl_embeddings = torch.cat( + [ + output_pos.last_hidden_state[:, 0, :], + output_neg.last_hidden_state[:, 0, :], + ], + dim=0, + ) + + itm_targets = torch.ones((3 * bs, ), + dtype=torch.long, + device=inputs.device) + itm_targets[bs:] = 0 + itm_logit = self.itm_head(vl_embeddings) + itm_loss = F.cross_entropy(itm_logit, itm_targets) + + return dict(itc_loss=itc_loss, itm_loss=itm_loss) + + def preprocess_text(self, data_samples): + sample_item = data_samples[0] + + if sample_item is not None and 'text' in sample_item: + if isinstance(sample_item.get('text'), (list, tuple)): + texts = [] + for sample in data_samples: + texts.extend(sample.get('text')) + 
elif isinstance(sample_item.get('text'), str): + texts = [sample.get('text') for sample in data_samples] + else: + raise TypeError('text must be a string or a list of strings') + else: + return None + + # perform tokenize first if satisfied conditions + texts = self.tokenizer( + texts, + padding='max_length', + truncation=True, + max_length=self.max_txt_len, + return_tensors='pt', + ).to(self.device) + + return texts + + def extract_feat( + self, + images: torch.Tensor = None, + data_samples: List[ActionDataSample] = None, + return_texts=True, + ) -> Dict[str, torch.Tensor]: + """Extract features from the input dict. + + Args: + images (tensor, optional): The images to extract features. + Defaults to None. + data_samples (list, optional): The data samples containing texts + to extract features. Defaults to None. + return_texts (bool): Whether to return the tokenized text and the + corresponding attention masks. Defaults to True. + + Returns: + Tuple[torch.Tensor]: The output features. + If multimodal_backbone is not exist, tuple of torch.Tensor + will be returned. + """ + if data_samples is not None: + texts = self.preprocess_text(data_samples) + else: + texts = None + + assert images is not None or texts is not None, \ + 'At least single modality should be passed as inputs.' 
+ + results = {} + if texts is not None and return_texts: + results.update({ + 'text_ids': texts.input_ids, + 'text_attn_mask': texts.attention_mask, + }) + + # extract image features + if images is not None: + image_embeds, pooled_image_embeds = self.encode_vision(images) + # concat temporal embeds + image_embeds = rearrange(image_embeds, + 'b t l c -> b (t l) c').contiguous() + results['image_embeds'] = image_embeds + results['image_feat'] = F.normalize( + self.vision_proj(pooled_image_embeds), dim=-1) + + # extract text features + if texts is not None: + texts_output = self.text_encoder( + texts.input_ids, + attention_mask=texts.attention_mask, + return_dict=True, + mode='text') + + text_embeds = texts_output.last_hidden_state + pooled_text_feat = text_embeds[:, 0] + results['text_embeds'] = text_embeds + results['text_feat'] = F.normalize( + self.text_proj(pooled_text_feat), dim=-1) + + return results + + def predict(self, images, data_samples, cal_i2t=True, cal_t2i=True): + feats = self.extract_feat(images, data_samples) + + return self.predict_all( + feats, data_samples, cal_i2t=cal_i2t, cal_t2i=cal_t2i) + + def predict_all(self, + feats, + data_samples, + num_images=None, + num_texts=None, + cal_i2t=True, + cal_t2i=True): + text_attn_mask = feats['text_attn_mask'] + image_embeds = feats.get('image_embeds', None) + image_feat = feats['image_feat'] + text_embeds = feats['text_embeds'] + text_feat = feats['text_feat'] + + num_images = num_images or image_feat.size(0) + num_texts = num_texts or text_feat.size(0) + + image_embeds_all = all_gather_concat(image_embeds)[:num_images] + image_feat_all = all_gather_concat(image_feat)[:num_images] + text_feat_all = all_gather_concat(text_feat)[:num_texts] + text_embeds_all = all_gather_concat(text_embeds)[:num_texts] + text_attn_mask_all = all_gather_concat(text_attn_mask)[:num_texts] + + results = [] + if cal_i2t: + result_i2t = self.compute_score_matrix_i2t( + image_feat, + image_embeds, + text_feat_all, + 
text_embeds_all, + text_attn_mask_all, + ) + results.append( + self._get_predictions(result_i2t, data_samples, mode='i2t')) + if cal_t2i: + result_t2i = self.compute_score_matrix_t2i( + image_feat_all, + image_embeds_all, + text_feat, + text_embeds, + text_attn_mask, + ) + results.append( + self._get_predictions(result_t2i, data_samples, mode='t2i')) + return tuple(results) + + def compute_score_matrix_i2t(self, img_feats, img_embeds, text_feats, + text_embeds, text_atts): + """Compare the score matrix for image-to-text retrieval. Every image + should compare to all the text features. + + Args: + img_feats (torch.Tensor): The input img feats tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + img_embeds (torch.Tensor): The input img embeds tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + text_feats (torch.Tensor): The input text feats tensor with shape + (N, C). N stands for numbers of all samples on all GPUs. + text_embeds (torch.Tensor): The input tensor with shape (N, C). + text_atts (torch.Tensor): The input tensor with shape (N, C). + + Returns: + torch.Tensor: Score matrix of image-to-text retrieval. 
+ """ + # compute i2t sim matrix + sim_matrix_i2t = torch.einsum('mld,nd->mln', img_feats, + text_feats).mean(1) + if self.fast_match: + return sim_matrix_i2t + + score_matrix_i2t = torch.full((img_feats.size(0), text_feats.size(0)), + -100.0).to(self.device) + for i in track_on_main_process( + range(img_feats.size(0)), 'Compute I2T scores...'): + sims = sim_matrix_i2t[i] + topk_sim, topk_idx = sims.topk(k=self.topk, dim=0) + topk_bz = 32 + encoder_output = img_embeds[i].repeat(topk_bz, 1, 1) + encoder_att = torch.ones( + encoder_output.size()[:-1], dtype=torch.long).to(self.device) + for j in range(0, self.topk // topk_bz): + batch_topk = topk_idx[j * topk_bz:(j + 1) * topk_bz] + output = self.text_encoder( + encoder_embeds=text_embeds[batch_topk], + attention_mask=text_atts[batch_topk], + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + mode='fusion') + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_i2t[i, batch_topk] = score + return score_matrix_i2t + + def compute_score_matrix_t2i(self, img_feats, img_embeds, text_feats, + text_embeds, text_atts): + """Compare the score matrix for text-to-image retrieval. Every text + should compare to all the image features. + + Args: + img_feats (torch.Tensor): The input img feats tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + img_embeds (torch.Tensor): The input img embeds tensor with shape + (M, C). M stands for numbers of samples on a single GPU. + text_feats (torch.Tensor): The input text feats tensor with shape + (N, C). N stands for numbers of all samples on all GPUs. + text_embeds (torch.Tensor): The input tensor with shape (M, C). + text_atts (torch.Tensor): The input tensor with shape (M, C). + + Returns: + torch.Tensor: Score matrix of text-to-image retrieval. 
+ """ + # compute t2i sim matrix + sim_matrix_t2i = torch.einsum('md,nld->mln', text_feats, + img_feats).mean(1) + + if self.fast_match: + return sim_matrix_t2i + + score_matrix_t2i = torch.full((text_feats.size(0), img_feats.size(0)), + -100.0).to(self.device) + for i in track_on_main_process( + range(text_feats.size(0)), 'Compute T2I scores...'): + sims = sim_matrix_t2i[i] + topk_sim, topk_idx = sims.topk(k=self.topk, dim=0) + topk_bz = 32 + for j in range(0, self.topk // topk_bz): + batch_topk = topk_idx[j * topk_bz:(j + 1) * topk_bz] + encoder_output = img_embeds[batch_topk] + encoder_att = torch.ones( + encoder_output.size()[:-1], + dtype=torch.long).to(self.device) + output = self.text_encoder( + encoder_embeds=text_embeds[i].repeat(topk_bz, 1, 1), + attention_mask=text_atts[i].repeat(topk_bz, 1), + encoder_hidden_states=encoder_output, + encoder_attention_mask=encoder_att, + return_dict=True, + mode='fusion') + score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1] + score_matrix_t2i[i, batch_topk] = score + return score_matrix_t2i + + def _get_predictions(self, + result: torch.Tensor, + data_samples: List[ActionDataSample], + mode: str = 'i2t'): + """Post-process the output of retriever. + + Args: + result (torch.Tensor): Score matrix of single retrieve, + either from image or text. + data_samples (List[ActionDataSample], optional): The annotation + data of every samples. + mode (str): Retrieve mode, either `i2t` for image to text, or `t2i` + text to image. Defaults to `i2t`. + + Returns: + List[ActionDataSample]: the raw data_samples with + the predicted results. + """ + + # create data sample if not exists + if data_samples is None: + data_samples = [ActionDataSample() for _ in range(result.size(0))] + elif mode == 't2i': + # Process data samples to align with the num of texts. 
@MODELS.register_module()
class VindLURetrievalMC(VindLURetrieval):
    """VindLU retrieval model for multiple-choice question answering.

    Each video is paired with several caption options; every option is
    scored by a weighted sum of the ITM head score and the video-text
    similarity, and the best-scoring option is predicted.

    Args:
        score_weight (float): Weight coefficient for itm_head score to
            compute the choice score. Defaults to 0.7.
        similarity_weight (float): Weight coefficient for similarity score
            to compute the choice score. Defaults to 0.3.
    """

    def __init__(self, score_weight=0.7, similarity_weight=0.3, **kwargs):
        # ``text_decoder`` is only used by generative heads; discard it if
        # present instead of raising ``KeyError`` when the config omits it.
        kwargs.pop('text_decoder', None)
        super().__init__(**kwargs)
        self.score_weight = score_weight
        self.similarity_weight = similarity_weight

    def predict(self, inputs, data_samples, **kwargs):
        """Predict the best caption option for a batch of inputs.

        Args:
            inputs (torch.Tensor): The input images tensor with shape
                (N, C, ...) in general.
            data_samples (List[ActionDataSample], optional): The annotation
                data of every sample; each must carry ``caption_options``.
                Defaults to None.
            **kwargs: Other keyword arguments accepted by ``predict``.

        Returns:
            List[ActionDataSample]: data samples with ``pred_label`` and
                the per-option ``score`` array attached.
        """
        num_options_per_q = len(data_samples[0].caption_options)
        for sample in data_samples:
            sample.text = sample.caption_options

        output = self.extract_feat(inputs, data_samples)

        text_embeds = output['text_embeds']
        text_attn_mask = output['text_attn_mask']
        image_embeds = output['image_embeds']
        image_feat = output['image_feat']
        text_feat = output['text_feat']

        # similarity between the video feature and every caption feature
        text_feat = rearrange(
            text_feat, '(b n) c -> b c n', n=num_options_per_q)
        sim = torch.matmul(image_feat.mean(1, keepdim=True),
                           text_feat).squeeze(1) / self.temp
        sim = F.softmax(sim, dim=1).flatten()

        # cross-modal fusion of every (video, option) pair
        encoder_output = image_embeds.repeat_interleave(
            num_options_per_q, dim=0)
        image_atts = torch.ones(
            encoder_output.size()[:-1], dtype=torch.long).to(inputs.device)
        output = self.text_encoder(
            encoder_embeds=text_embeds,
            attention_mask=text_attn_mask,
            encoder_hidden_states=encoder_output,
            encoder_attention_mask=image_atts,
            return_dict=True,
            mode='fusion',
        )
        itm_embeds = output.last_hidden_state[:, 0]  # [CLS]

        itm_score = F.softmax(self.itm_head(itm_embeds), dim=1)[:, 1]  # [bs*5]
        score = itm_score * self.score_weight + sim * self.similarity_weight

        pred_answers = score.view(-1, num_options_per_q).max(1)[1].cpu()

        # assemble predictions
        ensemble_scores = score.view(-1, num_options_per_q).cpu()  # (bsz, 5)

        out_data_samples = []
        for data_sample, ensemble_score, pred_ans in \
                zip(data_samples, ensemble_scores, pred_answers):
            data_sample.pred_label = pred_ans.item()
            data_sample.score = ensemble_score.numpy()
            out_data_samples.append(data_sample)

        return out_data_samples
+ """ + + def __init__(self, + text_decoder: dict, + answer_list_path: Optional[str] = None, + max_question_len: int = 25, + max_answer_len: int = 5, + num_ans_candidates: int = 128, + **kwargs): + super().__init__(**kwargs) + + self.max_question_len = max_question_len + self.max_answer_len = max_answer_len + self.num_ans_candidates = num_ans_candidates + self.answer_list_path = answer_list_path + self.text_decoder_cfg = text_decoder + + # for inference only + if answer_list_path: + self.answer_list = mmengine.load(answer_list_path) + + # delete extra/unnecessary modules inherited from VindLUBase + extra_attributes = ['vision_proj', 'text_proj', 'temp', 'itm_head'] + for attr in extra_attributes: + delattr(self, attr) + + self.text_decoder_cfg.gradient_checkpointing = \ + self.gradient_checkpointing + self.text_decoder = MODELS.build(self.text_decoder_cfg) + + def forward_encoder(self, inputs, data_samples): + # forward vision encoder + image_embeds, _ = self.encode_vision(inputs) + image_embeds = rearrange(image_embeds, 'b t l c -> b (t l) c') + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(inputs.device) + + # forward text encoder + questions = [sample.question for sample in data_samples] + questions = self.tokenizer( + questions, + padding='max_length', + truncation=True, + max_length=self.max_question_len, + return_tensors='pt').to(inputs.device) + + question_output = self.text_encoder( + questions.input_ids, + attention_mask=questions.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True) + + return questions, question_output + + def loss(self, inputs, data_samples): + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (dict): A batch of inputs. The input tensor with of + at least one modality. For image, the value is a tensor + of shape (N, C, ...) in general. + For text, the value is a dict of tokenized text inputs. 
+ data_samples (Optional[List[DataSample]]): + The annotation data of every samples. Defaults to None. + + Returns: + Dict[str, torch.tensor]: a dictionary of loss components of + """ + + questions, question_output = self.forward_encoder(inputs, data_samples) + + weights = torch.cat( + [torch.tensor(sample.gt_answer_weight) for sample in data_samples], + dim=0).to(inputs.device) + raw_answers = [] + for sample in data_samples: + raw_answers.extend(sample.gt_answer) + answer_count = torch.tensor([ + len(sample.gt_answer) for sample in data_samples + ]).to(inputs.device) + answers = [a + ' ' + '[SEP]' for a in raw_answers] + answers = self.tokenizer( + answers, + padding='max_length', + truncation=True, + max_length=self.max_answer_len, + return_tensors='pt').to(inputs.device) + + answer_targets = answers.input_ids.masked_fill( + answers.input_ids == self.tokenizer.pad_token_id, -100) + + question_states = [] + question_atts = [] + for b, n in enumerate(answer_count): + question_states += [question_output.last_hidden_state[b]] * n + question_atts += [questions.attention_mask[b]] * n + question_states = torch.stack(question_states, 0).to(inputs.device) + question_atts = torch.stack(question_atts, 0).to(inputs.device) + + answer_output = self.text_decoder( + answers.input_ids, + attention_mask=answers.attention_mask, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=answer_targets, + return_dict=True, + reduction='none', + ) + loss = weights * answer_output.loss + loss = loss.sum() / inputs.size(0) + + return dict(loss=loss) + + def predict(self, inputs, data_samples, **kwargs): + + questions, question_output = self.forward_encoder(inputs, data_samples) + + raw_answers = self.answer_list + answers = [a + ' ' + '[SEP]' for a in raw_answers] + answers = self.tokenizer( + answers, + padding='max_length', + truncation=True, + max_length=self.max_answer_len, + return_tensors='pt', + ).to(inputs.device) + + topk_ids, topk_probs = 
self.rank_answer( + question_output.last_hidden_state, questions.attention_mask, + answers.input_ids, answers.attention_mask, self.num_ans_candidates) + + out_data_samples = [] + for data_sample, topk_id, topk_prob in zip(data_samples, topk_ids, + topk_probs): + _, pred = topk_prob.max(dim=0) + data_sample.pred_answer = raw_answers[topk_id[pred]] + out_data_samples.append(data_sample) + + return out_data_samples + + def rank_answer(self, question_states, question_atts, answer_ids, + answer_atts, k): + """ + question_states: (bsz, Lq, d) + answer_ids: answer input id after tokenization, (#answers, La) + """ + num_ques = question_states.size(0) + start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token + + start_output = self.text_decoder( + start_ids, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + return_dict=True, + reduction='none', + ) + logits = start_output.logits[:, 0, :] # first token's logit + + # topk_probs: top-k probability + # topk_ids: [num_question, k] + answer_first_token = answer_ids[:, 1] + prob_first_token = F.softmax( + logits, dim=1).index_select( + dim=1, index=answer_first_token) + topk_probs, topk_ids = prob_first_token.topk(k, dim=1) + + # answer input: [num_question*k, answer_len] + input_ids = [] + input_atts = [] + for b, topk_id in enumerate(topk_ids): + input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) + input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) + input_ids = torch.cat(input_ids, dim=0) + input_atts = torch.cat(input_atts, dim=0) + + targets_ids = input_ids.masked_fill( + input_ids == self.tokenizer.pad_token_id, -100) + + question_states = question_states.repeat_interleave(k, dim=0) + question_atts = question_atts.repeat_interleave(k, dim=0) + + output = self.text_decoder( + input_ids, + attention_mask=input_atts, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=targets_ids, + return_dict=True, + reduction='none', + ) + 
+ answer_loss = output.loss + answer_loss = answer_loss.view(input_ids.size(0), -1) + + # topk_prob: first token probability + topk_probs = topk_probs.view(-1, 1) + log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) + + # re-calculate log probabilities for the answer sequences + # using chain rule + log_probs_sum = log_probs.sum(1) + log_probs_sum = log_probs_sum.view(num_ques, k) + + topk_probs = F.softmax(log_probs_sum, dim=-1) + # get top-k after re-ranking + topk_probs, rerank_id = topk_probs.topk(k, dim=1) + topk_ids = torch.gather(topk_ids, 1, rerank_id) + + return topk_ids, topk_probs + + def preprocess_state_dict(self, state_dict): + """Preprocess pretrained checkpoint for text_encoder and + text_decoder.""" + for key in list(state_dict.keys()): + if 'bert' in key: + encoder_key = key.replace('bert.', '') + state_dict[encoder_key] = state_dict[key] + + # init text decoder as multimodal encoder + # (last 6 layers of model.text_encoder) + # only for generation tasks like VQA + if self.text_decoder_cfg and 'text_encoder' in key: + if 'layer' in key: + encoder_keys = key.split('.') + layer_num = int(encoder_keys[4]) + if layer_num < self.text_encoder_cfg.fusion_layer: + del state_dict[key] + continue + else: + decoder_layer_num = layer_num - 9 + encoder_keys[4] = str(decoder_layer_num) + encoder_key = '.'.join(encoder_keys) + else: + encoder_key = key + decoder_key = encoder_key.replace('text_encoder', + 'text_decoder') + state_dict[decoder_key] = state_dict[key] + del state_dict[key] + return state_dict diff --git a/mmaction/models/multimodal/vindlu/xbert.py b/mmaction/models/multimodal/vindlu/xbert.py new file mode 100644 index 0000000000000000000000000000000000000000..783d7413dd3155b5ddd3f28c578bfc7345c21493 --- /dev/null +++ b/mmaction/models/multimodal/vindlu/xbert.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def _patched_bert_config(pretrained_model_name_or_path, fusion_layer,
                         encoder_width, **kwargs):
    """Load a ``BertConfig`` and patch in the VindLU-specific fields."""
    config = BertConfig.from_pretrained(pretrained_model_name_or_path)
    config.fusion_layer = fusion_layer
    config.encoder_width = encoder_width
    config.update(kwargs)
    return config


@MODELS.register_module()
class XBertForMaskedLM(BertForMaskedLM):
    """``BertForMaskedLM`` built from a patched config via the registry."""

    def __init__(self, pretrained_model_name_or_path, fusion_layer,
                 encoder_width, **kwargs):
        super().__init__(
            _patched_bert_config(pretrained_model_name_or_path, fusion_layer,
                                 encoder_width, **kwargs))


@MODELS.register_module()
class XBertModel(BertModel):
    """``BertModel`` built from a patched config via the registry."""

    def __init__(self, pretrained_model_name_or_path, fusion_layer,
                 encoder_width, add_pooling_layer, **kwargs):
        super().__init__(
            _patched_bert_config(pretrained_model_name_or_path, fusion_layer,
                                 encoder_width, **kwargs), add_pooling_layer)


@MODELS.register_module()
class BertDecoder(BertLMHeadModel):
    """``BertLMHeadModel`` built from a patched config via the registry."""

    def __init__(self, pretrained_model_name_or_path, fusion_layer,
                 encoder_width, **kwargs):
        super().__init__(
            _patched_bert_config(pretrained_model_name_or_path, fusion_layer,
                                 encoder_width, **kwargs))
class DownSample(nn.Module):
    """Downsample a feature map with a convolution plus a max pooling.

    ``downsample_position`` selects the order of the two ops: ``before``
    yields ``pool -> conv``, ``after`` yields ``conv -> pool``.

    Args:
        in_channels (int): Channel number of input features.
        out_channels (int): Channel number of output feature.
        kernel_size (int or Tuple[int]): Same as :class:`ConvModule`.
            Defaults to ``(3, 1, 1)``.
        stride (int or Tuple[int]): Same as :class:`ConvModule`.
            Defaults to ``(1, 1, 1)``.
        padding (int or Tuple[int]): Same as :class:`ConvModule`.
            Defaults to ``(1, 0, 0)``.
        groups (int): Same as :class:`ConvModule`. Defaults to 1.
        bias (bool or str): Same as :class:`ConvModule`. Defaults to False.
        conv_cfg (dict or ConfigDict): Same as :class:`ConvModule`.
            Defaults to ``dict(type='Conv3d')``.
        norm_cfg (dict or ConfigDict, optional): Same as :class:`ConvModule`.
            Defaults to None.
        act_cfg (dict or ConfigDict, optional): Same as :class:`ConvModule`.
            Defaults to None.
        downsample_position (str): Type of downsample position. Options are
            ``before`` and ``after``. Defaults to ``after``.
        downsample_scale (int or Tuple[int]): downsample scale for
            maxpooling. It will be used for kernel size and stride of
            maxpooling. Defaults to ``(1, 2, 2)``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int]] = (3, 1, 1),
        stride: Union[int, Tuple[int]] = (1, 1, 1),
        padding: Union[int, Tuple[int]] = (1, 0, 0),
        groups: int = 1,
        bias: Union[bool, str] = False,
        conv_cfg: ConfigType = dict(type='Conv3d'),
        norm_cfg: OptConfigType = None,
        act_cfg: OptConfigType = None,
        downsample_position: str = 'after',
        downsample_scale: Union[int, Tuple[int]] = (1, 2, 2)
    ) -> None:
        super().__init__()
        assert downsample_position in ['before', 'after']
        self.downsample_position = downsample_position
        self.conv = ConvModule(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            groups=groups,
            bias=bias,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # kernel size == stride, so pooling divides each dim by the scale
        self.pool = nn.MaxPool3d(
            downsample_scale, downsample_scale, (0, 0, 0), ceil_mode=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        if self.downsample_position == 'before':
            return self.conv(self.pool(x))
        return self.pool(self.conv(x))
+ """ + + def __init__( + self, + in_channels: Tuple[int], + mid_channels: Tuple[int], + out_channels: int, + downsample_scales: Tuple[int, Tuple[int]] = ((1, 1, 1), (1, 1, 1)) + ) -> None: + super().__init__() + num_stages = len(in_channels) + + self.downsamples = nn.ModuleList() + for i in range(num_stages): + downsample = DownSample( + in_channels[i], + mid_channels[i], + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + bias=False, + padding=(0, 0, 0), + groups=32, + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + downsample_position='before', + downsample_scale=downsample_scales[i]) + self.downsamples.append(downsample) + + self.fusion_conv = ConvModule( + sum(mid_channels), + out_channels, + 1, + stride=1, + padding=0, + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True)) + + def forward(self, x: Tuple[torch.Tensor]) -> torch.Tensor: + """Defines the computation performed at every call.""" + out = [self.downsamples[i](feature) for i, feature in enumerate(x)] + out = torch.cat(out, 1) + out = self.fusion_conv(out) + + return out + + +class SpatialModulation(nn.Module): + """Spatial Semantic Modulation. + + This module is used to align spatial semantics of features in the + multi-depth pyramid. For each but the top-level feature, a stack + of convolutions with level-specific stride are applied to it, matching + its spatial shape and receptive field with the top one. + + Args: + in_channels (Tuple[int]): Channel numbers of input features tuple. + out_channels (int): Channel numbers of output features tuple. 
+ """ + + def __init__(self, in_channels: Tuple[int], out_channels: int) -> None: + super().__init__() + + self.spatial_modulation = nn.ModuleList() + for channel in in_channels: + downsample_scale = out_channels // channel + downsample_factor = int(np.log2(downsample_scale)) + op = nn.ModuleList() + if downsample_factor < 1: + op = nn.Identity() + else: + for factor in range(downsample_factor): + in_factor = 2**factor + out_factor = 2**(factor + 1) + op.append( + ConvModule( + channel * in_factor, + channel * out_factor, (1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True))) + self.spatial_modulation.append(op) + + def forward(self, x: Tuple[torch.Tensor]) -> list: + """Defines the computation performed at every call.""" + out = [] + for i, _ in enumerate(x): + if isinstance(self.spatial_modulation[i], nn.ModuleList): + out_ = x[i] + for op in self.spatial_modulation[i]: + out_ = op(out_) + out.append(out_) + else: + out.append(self.spatial_modulation[i](x[i])) + return out + + +class AuxHead(nn.Module): + """Auxiliary Head. + + This auxiliary head is appended to receive stronger supervision, + leading to enhanced semantics. + + Args: + in_channels (int): Channel number of input features. + out_channels (int): Channel number of output features. + loss_weight (float): weight of loss for the auxiliary head. + Defaults to 0.5. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + loss_weight: float = 0.5, + loss_cls: ConfigType = dict(type='CrossEntropyLoss') + ) -> None: + super().__init__() + + self.conv = ConvModule( + in_channels, + in_channels * 2, (1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True)) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + self.loss_weight = loss_weight + self.dropout = nn.Dropout(p=0.5) + self.fc = nn.Linear(in_channels * 2, out_channels) + self.loss_cls = MODELS.build(loss_cls) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + normal_init(m, std=0.01) + if isinstance(m, nn.Conv3d): + xavier_init(m, distribution='uniform') + if isinstance(m, nn.BatchNorm3d): + constant_init(m, 1) + + def loss(self, x: torch.Tensor, + data_samples: Optional[SampleList]) -> dict: + """Calculate auxiliary loss.""" + x = self(x) + labels = [x.gt_label for x in data_samples] + labels = torch.stack(labels).to(x.device) + labels = labels.squeeze() + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + losses = dict() + losses['loss_aux'] = self.loss_weight * self.loss_cls(x, labels) + return losses + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Auxiliary head forward function.""" + x = self.conv(x) + x = self.avg_pool(x).squeeze(-1).squeeze(-1).squeeze(-1) + x = self.dropout(x) + x = self.fc(x) + + return x + + +class TemporalModulation(nn.Module): + """Temporal Rate Modulation. + + The module is used to equip TPN with a similar flexibility for temporal + tempo modulation as in the input-level frame pyramid. + + Args: + in_channels (int): Channel number of input features. + out_channels (int): Channel number of output features. + downsample_scale (int): Downsample scale for maxpooling. Defaults to 8. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + downsample_scale: int = 8) -> None: + super().__init__() + + self.conv = ConvModule( + in_channels, + out_channels, (3, 1, 1), + stride=(1, 1, 1), + padding=(1, 0, 0), + bias=False, + groups=32, + conv_cfg=dict(type='Conv3d'), + act_cfg=None) + self.pool = nn.MaxPool3d((downsample_scale, 1, 1), + (downsample_scale, 1, 1), (0, 0, 0), + ceil_mode=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + x = self.conv(x) + x = self.pool(x) + return x + + +@MODELS.register_module() +class TPN(nn.Module): + """TPN neck. + + This module is proposed in `Temporal Pyramid Network for Action Recognition + `_ + + Args: + in_channels (Tuple[int]): Channel numbers of input features tuple. + out_channels (int): Channel number of output feature. + spatial_modulation_cfg (dict or ConfigDict, optional): Config for + spatial modulation layers. Required keys are ``in_channels`` and + ``out_channels``. Defaults to None. + temporal_modulation_cfg (dict or ConfigDict, optional): Config for + temporal modulation layers. Defaults to None. + upsample_cfg (dict or ConfigDict, optional): Config for upsample + layers. The keys are same as that in :class:``nn.Upsample``. + Defaults to None. + downsample_cfg (dict or ConfigDict, optional): Config for downsample + layers. Defaults to None. + level_fusion_cfg (dict or ConfigDict, optional): Config for level + fusion layers. + Required keys are ``in_channels``, ``mid_channels``, + ``out_channels``. Defaults to None. + aux_head_cfg (dict or ConfigDict, optional): Config for aux head + layers. Required keys are ``out_channels``. Defaults to None. + flow_type (str): Flow type to combine the features. Options are + ``cascade`` and ``parallel``. Defaults to ``cascade``. 
+ """ + + def __init__(self, + in_channels: Tuple[int], + out_channels: int, + spatial_modulation_cfg: OptConfigType = None, + temporal_modulation_cfg: OptConfigType = None, + upsample_cfg: OptConfigType = None, + downsample_cfg: OptConfigType = None, + level_fusion_cfg: OptConfigType = None, + aux_head_cfg: OptConfigType = None, + flow_type: str = 'cascade') -> None: + super().__init__() + assert isinstance(in_channels, tuple) + assert isinstance(out_channels, int) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_tpn_stages = len(in_channels) + + assert spatial_modulation_cfg is None or isinstance( + spatial_modulation_cfg, dict) + assert temporal_modulation_cfg is None or isinstance( + temporal_modulation_cfg, dict) + assert upsample_cfg is None or isinstance(upsample_cfg, dict) + assert downsample_cfg is None or isinstance(downsample_cfg, dict) + assert aux_head_cfg is None or isinstance(aux_head_cfg, dict) + assert level_fusion_cfg is None or isinstance(level_fusion_cfg, dict) + + if flow_type not in ['cascade', 'parallel']: + raise ValueError( + f"flow type in TPN should be 'cascade' or 'parallel', " + f'but got {flow_type} instead.') + self.flow_type = flow_type + + self.temporal_modulation_ops = nn.ModuleList() + self.upsample_ops = nn.ModuleList() + self.downsample_ops = nn.ModuleList() + + self.level_fusion_1 = LevelFusion(**level_fusion_cfg) + self.spatial_modulation = SpatialModulation(**spatial_modulation_cfg) + + for i in range(self.num_tpn_stages): + + if temporal_modulation_cfg is not None: + downsample_scale = temporal_modulation_cfg[ + 'downsample_scales'][i] + temporal_modulation = TemporalModulation( + in_channels[-1], out_channels, downsample_scale) + self.temporal_modulation_ops.append(temporal_modulation) + + if i < self.num_tpn_stages - 1: + if upsample_cfg is not None: + upsample = nn.Upsample(**upsample_cfg) + self.upsample_ops.append(upsample) + + if downsample_cfg is not None: + downsample = 
DownSample(out_channels, out_channels, + **downsample_cfg) + self.downsample_ops.append(downsample) + + out_dims = level_fusion_cfg['out_channels'] + + # two pyramids + self.level_fusion_2 = LevelFusion(**level_fusion_cfg) + + self.pyramid_fusion = ConvModule( + out_dims * 2, + 2048, + 1, + stride=1, + padding=0, + bias=False, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True)) + + if aux_head_cfg is not None: + self.aux_head = AuxHead(self.in_channels[-2], **aux_head_cfg) + else: + self.aux_head = None + + def init_weights(self) -> None: + """Default init_weights for conv(msra) and norm in ConvModule.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + xavier_init(m, distribution='uniform') + if isinstance(m, nn.BatchNorm3d): + constant_init(m, 1) + + if self.aux_head is not None: + self.aux_head.init_weights() + + def forward(self, + x: Tuple[torch.Tensor], + data_samples: Optional[SampleList] = None) -> tuple: + """Defines the computation performed at every call.""" + + loss_aux = dict() + # Calculate auxiliary loss if `self.aux_head` + # and `data_samples` are not None. 
+ if self.aux_head is not None and data_samples is not None: + loss_aux = self.aux_head.loss(x[-2], data_samples) + + # Spatial Modulation + spatial_modulation_outs = self.spatial_modulation(x) + + # Temporal Modulation + temporal_modulation_outs = [] + for i, temporal_modulation in enumerate(self.temporal_modulation_ops): + temporal_modulation_outs.append( + temporal_modulation(spatial_modulation_outs[i])) + + outs = [out.clone() for out in temporal_modulation_outs] + if len(self.upsample_ops) != 0: + for i in range(self.num_tpn_stages - 1, 0, -1): + outs[i - 1] = outs[i - 1] + self.upsample_ops[i - 1](outs[i]) + + # Get top-down outs + top_down_outs = self.level_fusion_1(outs) + + # Build bottom-up flow using downsample operation + if self.flow_type == 'parallel': + outs = [out.clone() for out in temporal_modulation_outs] + if len(self.downsample_ops) != 0: + for i in range(self.num_tpn_stages - 1): + outs[i + 1] = outs[i + 1] + self.downsample_ops[i](outs[i]) + + # Get bottom-up outs + botton_up_outs = self.level_fusion_2(outs) + + # fuse two pyramid outs + outs = self.pyramid_fusion( + torch.cat([top_down_outs, botton_up_outs], 1)) + + return outs, loss_aux diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef36b8e3d0e19ae5315a2479e687ab4d2f09c7c --- /dev/null +++ b/mmaction/models/recognizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import BaseRecognizer +from .recognizer2d import Recognizer2D +from .recognizer3d import Recognizer3D +from .recognizer3d_mm import MMRecognizer3D +from .recognizer_audio import RecognizerAudio +from .recognizer_gcn import RecognizerGCN +from .recognizer_omni import RecognizerOmni + +__all__ = [ + 'BaseRecognizer', 'RecognizerGCN', 'Recognizer2D', 'Recognizer3D', + 'RecognizerAudio', 'RecognizerOmni', 'MMRecognizer3D' +] diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9ba7216c33b7f836c9f53c7f7e2b8440ca5b8976 --- /dev/null +++ b/mmaction/models/recognizers/base.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import warnings +from abc import ABCMeta, abstractmethod + +import torch +import torch.nn as nn +from mmengine.model import BaseModel, merge_dict + +from mmaction.registry import MODELS +from mmaction.utils import (ConfigType, ForwardResults, OptConfigType, + OptSampleList, SampleList) + + +class BaseRecognizer(BaseModel, metaclass=ABCMeta): + """Base class for recognizers. + + Args: + backbone (Union[ConfigDict, dict]): Backbone modules to + extract feature. + cls_head (Union[ConfigDict, dict], optional): Classification head to + process feature. Defaults to None. + neck (Union[ConfigDict, dict], optional): Neck for feature fusion. + Defaults to None. + train_cfg (Union[ConfigDict, dict], optional): Config for training. + Defaults to None. + test_cfg (Union[ConfigDict, dict], optional): Config for testing. + Defaults to None. + data_preprocessor (Union[ConfigDict, dict], optional): The pre-process + config of :class:`ActionDataPreprocessor`. it usually includes, + ``mean``, ``std`` and ``format_shape``. Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + cls_head: OptConfigType = None, + neck: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None) -> None: + if data_preprocessor is None: + # This preprocessor will only stack batch data samples. + data_preprocessor = dict(type='ActionDataPreprocessor') + + super(BaseRecognizer, + self).__init__(data_preprocessor=data_preprocessor) + + def is_from(module, pkg_name): + # check whether the backbone is from pkg + model_type = module['type'] + if isinstance(model_type, str): + return model_type.startswith(pkg_name) + elif inspect.isclass(model_type) or inspect.isfunction(model_type): + module_name = model_type.__module__ + return pkg_name in module_name + else: + raise TypeError( + f'Unsupported type of module {type(module["type"])}') + + # Record the source of the backbone. + self.backbone_from = 'mmaction2' + if is_from(backbone, 'mmcls.'): + try: + # Register all mmcls models. + import mmcls.models # noqa: F401 + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install mmcls to use this backbone.') + self.backbone = MODELS.build(backbone) + self.backbone_from = 'mmcls' + elif is_from(backbone, 'mmpretrain.'): + try: + # Register all mmpretrain models. 
+ import mmpretrain.models # noqa: F401 + except (ImportError, ModuleNotFoundError): + raise ImportError( + 'Please install mmpretrain to use this backbone.') + self.backbone = MODELS.build(backbone) + self.backbone_from = 'mmpretrain' + elif is_from(backbone, 'torchvision.'): + try: + import torchvision.models + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install torchvision to use this ' + 'backbone.') + self.backbone_from = 'torchvision' + self.feature_shape = backbone.pop('feature_shape', None) + backbone_type = backbone.pop('type') + if isinstance(backbone_type, str): + backbone_type = backbone_type[12:] + self.backbone = torchvision.models.__dict__[backbone_type]( + **backbone) + else: + self.backbone = backbone_type(**backbone) + # disable the classifier + self.backbone.classifier = nn.Identity() + self.backbone.fc = nn.Identity() + elif is_from(backbone, 'timm.'): + # currently, only support use `str` as backbone type + try: + import timm + except (ImportError, ModuleNotFoundError): + raise ImportError('Please install timm>=0.9.0 to use this ' + 'backbone.') + self.backbone_from = 'timm' + self.feature_shape = backbone.pop('feature_shape', None) + # disable the classifier + backbone['num_classes'] = 0 + backbone_type = backbone.pop('type') + if isinstance(backbone_type, str): + backbone_type = backbone_type[5:] + self.backbone = timm.create_model(backbone_type, **backbone) + else: + raise TypeError( + f'Unsupported timm backbone type: {type(backbone_type)}') + else: + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if cls_head is not None: + self.cls_head = MODELS.build(cls_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + @abstractmethod + def extract_feat(self, inputs: torch.Tensor, **kwargs) -> ForwardResults: + """Extract features from raw inputs.""" + + @property + def with_neck(self) -> bool: + """bool: whether the recognizer has a neck""" + return 
hasattr(self, 'neck') and self.neck is not None + + @property + def with_cls_head(self) -> bool: + """bool: whether the recognizer has a cls_head""" + return hasattr(self, 'cls_head') and self.cls_head is not None + + def init_weights(self) -> None: + """Initialize the model network weights.""" + if self.backbone_from in ['torchvision', 'timm']: + warnings.warn('We do not initialize weights for backbones in ' + f'{self.backbone_from}, since the weights for ' + f'backbones in {self.backbone_from} are initialized ' + 'in their __init__ functions.') + + def fake_init(): + pass + + # avoid repeated initialization + self.backbone.init_weights = fake_init + super().init_weights() + + def loss(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + data_samples (List[``ActionDataSample``]): The batch + data samples. It usually includes information such + as ``gt_label``. + + Returns: + dict: A dictionary of loss components. + """ + feats, loss_kwargs = \ + self.extract_feat(inputs, + data_samples=data_samples) + + # loss_aux will be a empty dict if `self.with_neck` is False. + loss_aux = loss_kwargs.get('loss_aux', dict()) + loss_cls = self.cls_head.loss(feats, data_samples, **loss_kwargs) + losses = merge_dict(loss_cls, loss_aux) + return losses + + def predict(self, inputs: torch.Tensor, data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + These should usually be mean centered and std scaled. + data_samples (List[``ActionDataSample``]): The batch + data samples. It usually includes information such + as ``gt_label``. + + Returns: + List[``ActionDataSample``]: Return the recognition results. 
+ The returns value is ``ActionDataSample``, which usually contains + ``pred_scores``. And the ``pred_scores`` usually contains + following keys. + + - item (torch.Tensor): Classification scores, has a shape + (num_classes, ) + """ + feats, predict_kwargs = self.extract_feat(inputs, test_mode=True) + predictions = self.cls_head.predict(feats, data_samples, + **predict_kwargs) + return predictions + + def _forward(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> ForwardResults: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + stage (str): Which stage to output the features. + + Returns: + Union[tuple, torch.Tensor]: Features from ``backbone`` or ``neck`` + or ``head`` forward. + """ + feats, _ = self.extract_feat(inputs, stage=stage) + return feats + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSampleList = None, + mode: str = 'tensor', + **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. + - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (List[``ActionDataSample], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. 
+ + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'tensor': + return self._forward(inputs, **kwargs) + if mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py new file mode 100644 index 0000000000000000000000000000000000000000..99e4e951ca76d55b998f13755ec0047b55b16810 --- /dev/null +++ b/mmaction/models/recognizers/recognizer2d.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'neck', + data_samples: SampleList = None, + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``neck``. + data_samples (List[:obj:`ActionDataSample`]): Action data + samples, which are only needed in training. Defaults to None. + test_mode: (bool): Whether in test mode. Defaults to False. + + Returns: + Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``num_segs``, ``fcn_test``, ``loss_aux``. + """ + + # Record the kwargs required by `loss` and `predict`. 
+ loss_predict_kwargs = dict() + + num_segs = inputs.shape[1] + loss_predict_kwargs['num_segs'] = num_segs + + # [N, num_crops * num_segs, C, H, W] -> + # [N * num_crops * num_segs, C, H, W] + # `num_crops` is calculated by: + # 1) `twice_sample` in `SampleFrames` + # 2) `num_sample_positions` in `DenseSampleFrames` + # 3) `ThreeCrop/TenCrop` in `test_pipeline` + # 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1` + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + def forward_once(batch_imgs): + # Extract features through backbone. + if (hasattr(self.backbone, 'features') + and self.backbone_from == 'torchvision'): + x = self.backbone.features(batch_imgs) + elif self.backbone_from == 'timm': + x = self.backbone.forward_features(batch_imgs) + elif self.backbone_from in ['mmcls', 'mmpretrain']: + x = self.backbone(batch_imgs) + if isinstance(x, tuple): + assert len(x) == 1 + x = x[0] + else: + x = self.backbone(batch_imgs) + + if self.backbone_from in ['torchvision', 'timm']: + if not self.feature_shape: + # Transformer-based feature shape: B x L x C. + if len(x.shape) == 3: + self.feature_shape = 'NLC' + # Resnet-based feature shape: B x C x Hs x Ws. + elif len(x.shape) == 4: + self.feature_shape = 'NCHW' + + if self.feature_shape == 'NHWC': + x = nn.AdaptiveAvgPool2d(1)(x.permute(0, 3, 1, + 2)) # B x C x 1 x 1 + elif self.feature_shape == 'NCHW': + x = nn.AdaptiveAvgPool2d(1)(x) # B x C x 1 x 1 + elif self.feature_shape == 'NLC': + x = nn.AdaptiveAvgPool1d(1)(x.transpose(1, 2)) # B x C x 1 + + x = x.reshape((x.shape[0], -1)) # B x C + x = x.reshape(x.shape + (1, 1)) # B x C x 1 x 1 + return x + + # Check settings of `fcn_test`. 
+ fcn_test = False + if test_mode: + if self.test_cfg is not None and self.test_cfg.get( + 'fcn_test', False): + fcn_test = True + num_segs = self.test_cfg.get('num_segs', + self.backbone.num_segments) + loss_predict_kwargs['fcn_test'] = fcn_test + + # inference with batch size of `max_testing_views` if set + if self.test_cfg is not None and self.test_cfg.get( + 'max_testing_views', False): + max_testing_views = self.test_cfg.get('max_testing_views') + assert isinstance(max_testing_views, int) + # backbone specify num_segments + num_segments = self.backbone.get('num_segments') + if num_segments is not None: + assert max_testing_views % num_segments == 0, \ + 'make sure that max_testing_views is a multiple of ' \ + 'num_segments, but got {max_testing_views} and '\ + '{num_segments}' + + total_views = inputs.shape[0] + view_ptr = 0 + feats = [] + while view_ptr < total_views: + batch_imgs = inputs[view_ptr:view_ptr + max_testing_views] + feat = forward_once(batch_imgs) + if self.with_neck: + feat, _ = self.neck(feat) + feats.append(feat) + view_ptr += max_testing_views + + def recursively_cat(feats): + # recursively traverse feats until it's a tensor, + # then concat + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) + else: + x = forward_once(inputs) + else: + x = forward_once(inputs) + + # Return features extracted through backbone. + if stage == 'backbone': + return x, loss_predict_kwargs + + loss_aux = dict() + if self.with_neck: + # x is a tuple with multiple feature maps. 
+ x = [ + each.reshape((-1, num_segs) + + each.shape[1:]).transpose(1, 2).contiguous() + for each in x + ] + x, loss_aux = self.neck(x, data_samples=data_samples) + if not fcn_test: + x = x.squeeze(2) + loss_predict_kwargs['num_segs'] = 1 + elif fcn_test: + # full convolution (fcn) testing when no neck + # [N * num_crops * num_segs, C', H', W'] -> + # [N * num_crops, C', num_segs, H', W'] + x = x.reshape((-1, num_segs) + + x.shape[1:]).transpose(1, 2).contiguous() + + loss_predict_kwargs['loss_aux'] = loss_aux + + # Return features extracted through neck. + if stage == 'neck': + return x, loss_predict_kwargs + + # Return raw logits through head. + if self.with_cls_head and stage == 'head': + # [N * num_crops, num_classes] + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py new file mode 100644 index 0000000000000000000000000000000000000000..d2af5c3a40794e8875f122dbec2091ce4ea50e3d --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +from mmaction.registry import MODELS +from mmaction.utils import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class Recognizer3D(BaseRecognizer): + """3D recognizer model framework.""" + + def extract_feat(self, + inputs: Tensor, + stage: str = 'neck', + data_samples: OptSampleList = None, + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (torch.Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``'neck'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + torch.Tensor: The extracted features. 
+ dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``loss_aux``. + """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + num_segs = inputs.shape[1] + # [N, num_crops, C, T, H, W] -> + # [N * num_crops, C, T, H, W] + # `num_crops` is calculated by: + # 1) `twice_sample` in `SampleFrames` + # 2) `num_sample_positions` in `DenseSampleFrames` + # 3) `ThreeCrop/TenCrop` in `test_pipeline` + # 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1` + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + # Check settings of test + if test_mode: + if self.test_cfg is not None: + loss_predict_kwargs['fcn_test'] = self.test_cfg.get( + 'fcn_test', False) + if self.test_cfg is not None and self.test_cfg.get( + 'max_testing_views', False): + max_testing_views = self.test_cfg.get('max_testing_views') + assert isinstance(max_testing_views, int) + + total_views = inputs.shape[0] + assert num_segs == total_views, ( + 'max_testing_views is only compatible ' + 'with batch_size == 1') + view_ptr = 0 + feats = [] + while view_ptr < total_views: + batch_imgs = inputs[view_ptr:view_ptr + max_testing_views] + feat = self.backbone(batch_imgs) + if self.with_neck: + feat, _ = self.neck(feat) + feats.append(feat) + view_ptr += max_testing_views + + def recursively_cat(feats): + # recursively traverse feats until it's a tensor, + # then concat + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) + else: + x = self.backbone(inputs) + if self.with_neck: + x, _ = self.neck(x) + + return x, loss_predict_kwargs + else: + # Return features extracted through backbone + x 
= self.backbone(inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + loss_aux = dict() + if self.with_neck: + x, loss_aux = self.neck(x, data_samples=data_samples) + + # Return features extracted through neck + loss_predict_kwargs['loss_aux'] = loss_aux + if stage == 'neck': + return x, loss_predict_kwargs + + # Return raw logits through head. + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..12541c5e1c85b0fffaa48389f809a3a33bedb428 --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d_mm.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch + +from mmaction.registry import MODELS +from mmaction.utils import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class MMRecognizer3D(BaseRecognizer): + """Multi-modal 3D recognizer model framework.""" + + def extract_feat(self, + inputs: Dict[str, torch.Tensor], + stage: str = 'backbone', + data_samples: OptSampleList = None, + test_mode: bool = False) -> Tuple: + """Extract features. + + Args: + inputs (dict[str, torch.Tensor]): The multi-modal input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + tuple[torch.Tensor]: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. 
+ """ + # [N, num_views, C, T, H, W] -> + # [N * num_views, C, T, H, W] + for m, m_data in inputs.items(): + m_data = m_data.reshape((-1, ) + m_data.shape[2:]) + inputs[m] = m_data + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + x = self.backbone(**inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_audio.py b/mmaction/models/recognizers/recognizer_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..684f482e6eb16bf606a23cc7d42f119fbe3c8df6 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_audio.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch import Tensor + +from mmaction.registry import MODELS +from .base import BaseRecognizer + + +@MODELS.register_module() +class RecognizerAudio(BaseRecognizer): + """Audio recognizer model framework.""" + + def extract_feat(self, + batch_inputs: Tensor, + stage: str = 'backbone', + **kwargs) -> tuple: + """Extract features of different stages. + + Args: + batch_inputs (Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``backbone``. + + Returns: + Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. This will be an empty dict in audio recognizer. 
+ """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + batch_inputs = batch_inputs.view((-1, ) + batch_inputs.shape[2:]) + + x = self.backbone(batch_inputs) + + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_gcn.py b/mmaction/models/recognizers/recognizer_gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..1a8e3df3ae1a1fc03542ee13327e034d7a6034f5 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_gcn.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch + +from mmaction.registry import MODELS +from .base import BaseRecognizer + + +@MODELS.register_module() +class RecognizerGCN(BaseRecognizer): + """GCN-based recognizer for skeleton-based action recognition.""" + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> Tuple: + """Extract features at the given stage. + + Args: + inputs (torch.Tensor): The input skeleton with shape of + `(B, num_clips, num_person, clip_len, num_joints, 3 or 2)`. + stage (str): The stage to output the features. + Defaults to ``'backbone'``. + + Returns: + tuple: THe extracted features and a dict recording the kwargs + for downstream pipeline, which is an empty dict for the + GCN-based recognizer. 
+ """ + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + bs, nc = inputs.shape[:2] + inputs = inputs.reshape((bs * nc, ) + inputs.shape[2:]) + + x = self.backbone(inputs) + + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/recognizers/recognizer_omni.py b/mmaction/models/recognizers/recognizer_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..df8154182678ead19a1eabe0ec009e6347bfe893 --- /dev/null +++ b/mmaction/models/recognizers/recognizer_omni.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Sequence, Union + +import torch +from mmengine.model import BaseModel + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType, ForwardResults, SampleList + + +@MODELS.register_module() +class RecognizerOmni(BaseModel): + """An Omni-souce recognizer model framework for joint-training of image and + video recognition tasks. + + The `backbone` and `cls_head` should be able to accept both images and + videos as inputs. + """ + + def __init__(self, backbone: ConfigType, cls_head: ConfigType, + data_preprocessor: ConfigType) -> None: + super().__init__(data_preprocessor=data_preprocessor) + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def forward(self, *data_samples, mode: str, **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: + + - ``tensor``: Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - ``predict``: Forward and return the predictions, which are fully + processed to a list of :obj:`ActionDataSample`. 
+ - ``loss``: Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + data_samples: should be a sequence of ``SampleList`` if + ``mode="predict"`` or ``mode="loss"``. Each ``SampleList`` is + the annotation data of one data source. + It should be a single torch tensor if ``mode="tensor"``. + mode (str): Return what kind of value. Defaults to ``tensor``. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of ``ActionDataSample``. + - If ``mode="loss"``, return a dict of tensor. + """ + + if mode == 'loss' or mode == 'predict': + if mode == 'loss': + return self.loss(data_samples) + return self.predict(data_samples) + + elif mode == 'tensor': + + assert isinstance(data_samples, torch.Tensor) + + data_ndim = data_samples.ndim + if data_ndim not in [4, 5]: + info = f'Input is a {data_ndim}D tensor. ' + info += 'Only 4D (BCHW) or 5D (BCTHW) tensors are supported!' + raise ValueError(info) + + return self._forward(data_samples, **kwargs) + + def loss(self, data_samples: Sequence[SampleList]) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + data_samples (Sequence[SampleList]): a sequence of SampleList. Each + SampleList contains data samples from the same data source. + + Returns: + dict: A dictionary of loss components. 
+ """ + loss_dict = {} + for idx, data in enumerate(data_samples): + inputs, data_samples = data['inputs'], data['data_samples'] + feats = self.extract_feat(inputs) + loss_cls = self.cls_head.loss(feats, data_samples) + for key in loss_cls: + loss_dict[key + f'_{idx}'] = loss_cls[key] + return loss_dict + + def predict(self, data_samples: Sequence[SampleList]) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + data_samples (Sequence[SampleList]): a sequence of SampleList. Each + SampleList contains data samples from the same data source. + + Returns: + List[``ActionDataSample``]: Return the recognition results. + The returns value is ``ActionDataSample``, which usually contains + ``pred_scores``. And the ``pred_scores`` usually contains + following keys. + + - item (torch.Tensor): Classification scores, has a shape + (num_classes, ) + """ + assert len(data_samples) == 1 + feats = self.extract_feat(data_samples[0]['inputs'], test_mode=True) + predictions = self.cls_head.predict(feats, + data_samples[0]['data_samples']) + return predictions + + def _forward(self, + inputs: torch.Tensor, + stage: str = 'backbone', + **kwargs) -> ForwardResults: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (torch.Tensor): Raw Inputs of the recognizer. + stage (str): Which stage to output the features. + + Returns: + Union[tuple, torch.Tensor]: Features from ``backbone`` or ``head`` + forward. + """ + feats, _ = self.extract_feat(inputs, stage=stage) + return feats + + def _run_forward(self, data: Union[dict, tuple, list], + mode: str) -> Union[Dict[str, torch.Tensor], list]: + """Unpacks data for :meth:`forward` + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + Returns: + dict or list: Results of training or testing mode. 
+ """ + if isinstance(data, dict): + data = [data] + results = self(*data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self(*data, mode=mode) + else: + raise TypeError + return results + + def extract_feat(self, + inputs: torch.Tensor, + stage: str = 'backbone', + test_mode: bool = False) -> tuple: + """Extract features of different stages. + + Args: + inputs (torch.Tensor): The input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + torch.Tensor: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. These keys are usually included: + ``loss_aux``. + """ + + if len(inputs.shape) == 6: + inputs = inputs.view((-1, ) + inputs.shape[2:]) + + # Check settings of test + if test_mode: + x = self.backbone(inputs) + return x + else: + # Return features extracted through backbone + x = self.backbone(inputs) + if stage == 'backbone': + return x + x = self.cls_head(x) + return x diff --git a/mmaction/models/roi_heads/__init__.py b/mmaction/models/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca01c608ed4a3473e46c35d5771d8367c61b078 --- /dev/null +++ b/mmaction/models/roi_heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+try: + from mmdet.registry import MODELS as MMDET_MODELS + + from .bbox_heads import BBoxHeadAVA + from .roi_extractors import SingleRoIExtractor3D + from .roi_head import AVARoIHead + from .shared_heads import ACRNHead, FBOHead, LFBInferHead + + for module in [ + AVARoIHead, BBoxHeadAVA, SingleRoIExtractor3D, ACRNHead, FBOHead, + LFBInferHead + ]: + + MMDET_MODELS.register_module()(module) + + __all__ = [ + 'AVARoIHead', 'BBoxHeadAVA', 'SingleRoIExtractor3D', 'ACRNHead', + 'FBOHead', 'LFBInferHead' + ] + +except (ImportError, ModuleNotFoundError): + pass diff --git a/mmaction/models/roi_heads/bbox_heads/__init__.py b/mmaction/models/roi_heads/bbox_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52a7c4aebabfb90ec004b5caab1307c5c5ddea47 --- /dev/null +++ b/mmaction/models/roi_heads/bbox_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_head import BBoxHeadAVA + +__all__ = ['BBoxHeadAVA'] diff --git a/mmaction/models/roi_heads/bbox_heads/bbox_head.py b/mmaction/models/roi_heads/bbox_heads/bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..069e09ba2ece75210694cffef9a50c7cbc141890 --- /dev/null +++ b/mmaction/models/roi_heads/bbox_heads/bbox_head.py @@ -0,0 +1,415 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def cross_entropy_loss(input, target, reduction='mean'):
    """Multi-target cross entropy for ``torch < 1.10``.

    Mirrors :func:`torch.nn.functional.cross_entropy` with probability
    targets so both torch-version branches share one call contract.

    Args:
        input (Tensor): Raw classification logits of shape ``(N, C)``.
        target (Tensor): Per-class target probabilities, shape ``(N, C)``.
        reduction (str): ``'none'`` | ``'mean'`` | ``'sum'``. Defaults to
            ``'mean'`` to match ``F.cross_entropy``; the original default
            of ``'None'`` was not a valid reduction value and silently
            returned the unreduced loss, diverging from the
            ``torch >= 1.10`` code path.

    Returns:
        Tensor: The reduced loss, or the per-sample losses when
        ``reduction='none'``.
    """
    log_prob = input.log_softmax(dim=-1)  # Compute Log of Softmax
    loss = -(log_prob * target).sum(dim=-1)  # Per-sample loss
    if reduction.lower() == 'mean':
        return loss.mean()
    if reduction.lower() == 'sum':
        return loss.sum()
    return loss
+ dropout_ratio (float): A float in ``[0, 1]``, indicates the + dropout_ratio. Defaults to 0. + dropout_before_pool (bool): Dropout Feature before spatial temporal + pooling. Defaults to True. + topk (int or Tuple[int]): Parameter for evaluating Top-K accuracy. + Defaults to ``(3, 5)``. + multilabel (bool): Whether used for a multilabel task. + Defaults to True. + mlp_head (bool): Whether to use an MLP as the classification head. + Defaults to False, i.e., using a single linear head. + """ + + def __init__( + self, + background_class: bool, + temporal_pool_type: str = 'avg', + spatial_pool_type: str = 'max', + in_channels: int = 2048, + focal_gamma: float = 0., + focal_alpha: float = 1., + num_classes: int = 81, # First class reserved (BBox as pos/neg) + dropout_ratio: float = 0, + dropout_before_pool: bool = True, + topk: Union[int, Tuple[int]] = (3, 5), + multilabel: bool = True, + mlp_head: bool = False) -> None: + super(BBoxHeadAVA, self).__init__() + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.temporal_pool_type = temporal_pool_type + self.spatial_pool_type = spatial_pool_type + + self.in_channels = in_channels + self.num_classes = num_classes + + self.dropout_ratio = dropout_ratio + self.dropout_before_pool = dropout_before_pool + + self.multilabel = multilabel + + self.focal_gamma = focal_gamma + self.focal_alpha = focal_alpha + + self.background_class = background_class + + if topk is None: + self.topk = () + elif isinstance(topk, int): + self.topk = (topk, ) + elif isinstance(topk, tuple): + assert all([isinstance(k, int) for k in topk]) + self.topk = topk + else: + raise TypeError('topk should be int or tuple[int], ' + f'but get {type(topk)}') + # Class 0 is ignored when calculating accuracy, + # so topk cannot be equal to num_classes. 
+ assert all([k < num_classes for k in self.topk]) + + in_channels = self.in_channels + # Pool by default + if self.temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if self.spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + + if mlp_head: + self.fc_cls = nn.Sequential( + nn.Linear(in_channels, in_channels), nn.ReLU(), + nn.Linear(in_channels, num_classes)) + else: + self.fc_cls = nn.Linear(in_channels, num_classes) + + def init_weights(self) -> None: + """Initialize the classification head.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Computes the classification logits given ROI features.""" + if self.dropout_before_pool and self.dropout_ratio > 0: + x = self.dropout(x) + + x = self.temporal_pool(x) + x = self.spatial_pool(x) + + if not self.dropout_before_pool and self.dropout_ratio > 0: + x = self.dropout(x) + + x = x.view(x.size(0), -1) + cls_score = self.fc_cls(x) + return cls_score + + @staticmethod + def get_targets(sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> tuple: + pos_proposals = [res.pos_priors for res in sampling_results] + neg_proposals = [res.neg_priors for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_targets = bbox_target(pos_proposals, neg_proposals, pos_gt_labels, + rcnn_train_cfg) + return cls_targets + + @staticmethod + def get_recall_prec(pred_vec: Tensor, target_vec: Tensor) -> tuple: + """Computes the Recall/Precision for both multi-label and single label + scenarios. + + Note that the computation calculates the micro average. 
+ + Note, that in both cases, the concept of correct/incorrect is the same. + Args: + pred_vec (tensor[N x C]): each element is either 0 or 1 + target_vec (tensor[N x C]): each element is either 0 or 1 - for + single label it is expected that only one element is on (1) + although this is not enforced. + """ + correct = pred_vec & target_vec + recall = correct.sum(1) / target_vec.sum(1).float() # Enforce Float + prec = correct.sum(1) / (pred_vec.sum(1) + 1e-6) + return recall.mean(), prec.mean() + + @staticmethod + def topk_to_matrix(probs: Tensor, k: int) -> Tensor: + """Converts top-k to binary matrix.""" + topk_labels = probs.topk(k, 1, True, True)[1] + topk_matrix = probs.new_full(probs.size(), 0, dtype=torch.bool) + for i in range(probs.shape[0]): + topk_matrix[i, topk_labels[i]] = 1 + return topk_matrix + + def topk_accuracy(self, + pred: Tensor, + target: Tensor, + thr: float = 0.5) -> tuple: + """Computes the Top-K Accuracies for both single and multi-label + scenarios.""" + # Define Target vector: + target_bool = target > 0.5 + + # Branch on Multilabel for computing output classification + if self.multilabel: + pred = pred.sigmoid() + else: + pred = pred.softmax(dim=1) + + # Compute at threshold (K=1 for single) + if self.multilabel: + pred_bool = pred > thr + else: + pred_bool = self.topk_to_matrix(pred, 1) + recall_thr, prec_thr = self.get_recall_prec(pred_bool, target_bool) + + # Compute at various K + recalls_k, precs_k = [], [] + for k in self.topk: + pred_bool = self.topk_to_matrix(pred, k) + recall, prec = self.get_recall_prec(pred_bool, target_bool) + recalls_k.append(recall) + precs_k.append(prec) + + # Return all + return recall_thr, prec_thr, recalls_k, precs_k + + def loss_and_target(self, cls_score: Tensor, rois: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, **kwargs) -> dict: + """Calculate the loss based on the features extracted by the bbox head. 
+ + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss components. + """ + cls_targets = self.get_targets(sampling_results, rcnn_train_cfg) + labels, _ = cls_targets + + losses = dict() + # Only use the cls_score + if cls_score is not None: + if self.background_class: + labels = labels[:, 1:] # Get valid labels (ignore first one) + cls_score = cls_score[:, 1:] + pos_inds = torch.sum(labels, dim=-1) > 0 + cls_score = cls_score[pos_inds] + labels = labels[pos_inds] + + # Compute First Recall/Precisions + # This has to be done first before normalising the label-space. + recall_thr, prec_thr, recall_k, prec_k = self.topk_accuracy( + cls_score, labels, thr=0.5) + losses['recall@thr=0.5'] = recall_thr + losses['prec@thr=0.5'] = prec_thr + for i, k in enumerate(self.topk): + losses[f'recall@top{k}'] = recall_k[i] + losses[f'prec@top{k}'] = prec_k[i] + + # If Single-label, need to ensure that target labels sum to 1: ie + # that they are valid probabilities. + if not self.multilabel and self.background_class: + labels = labels / labels.sum(dim=1, keepdim=True) + + # Select Loss function based on single/multi-label + # NB. 
Both losses auto-compute sigmoid/softmax on prediction + if self.multilabel: + loss_func = F.binary_cross_entropy_with_logits + else: + loss_func = cross_entropy_loss + + # Compute loss + loss = loss_func(cls_score, labels, reduction='none') + pt = torch.exp(-loss) + F_loss = self.focal_alpha * (1 - pt)**self.focal_gamma * loss + losses['loss_action_cls'] = torch.mean(F_loss) + + return dict(loss_bbox=losses, bbox_targets=cls_targets) + + def predict_by_feat(self, + rois: Tuple[Tensor], + cls_scores: Tuple[Tensor], + batch_img_metas: List[dict], + rcnn_test_cfg: Optional[ConfigDict] = None, + **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + rois (tuple[Tensor]): Tuple of boxes to be transformed. + Each has shape (num_boxes, 5). last dimension 5 arrange as + (batch_index, x1, y1, x2, y2). + cls_scores (tuple[Tensor]): Tuple of box scores, each has shape + (num_boxes, num_classes + 1). + bbox_preds (tuple[Tensor]): Tuple of box energies / deltas, each + has shape (num_boxes, num_classes * 4). + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`, optional): `test_cfg` of R-CNN. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single( + roi=rois[img_id], + cls_score=cls_scores[img_id], + img_meta=img_meta, + rcnn_test_cfg=rcnn_test_cfg, + **kwargs) + result_list.append(results) + + return result_list + + def _predict_by_feat_single(self, + roi: Tensor, + cls_score: Tensor, + img_meta: dict, + rcnn_test_cfg: Optional[ConfigDict] = None, + **kwargs) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + + # might be used by testing w. 
augmentation + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + + # Handle Multi/Single Label + if cls_score is not None: + if self.multilabel: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(dim=-1) + else: + scores = None + + bboxes = roi[:, 1:] + assert bboxes.shape[-1] == 4 + + # First reverse the flip + img_h, img_w = img_meta['img_shape'] + if img_meta.get('flip', False): + bboxes_ = bboxes.clone() + bboxes_[:, 0] = img_w - 1 - bboxes[:, 2] + bboxes_[:, 2] = img_w - 1 - bboxes[:, 0] + bboxes = bboxes_ + + # Then normalize the bbox to [0, 1] + bboxes[:, 0::2] /= img_w + bboxes[:, 1::2] /= img_h + + def _bbox_crop_undo(bboxes, crop_quadruple): + decropped = bboxes.clone() + + if crop_quadruple is not None: + x1, y1, tw, th = crop_quadruple + decropped[:, 0::2] = bboxes[..., 0::2] * tw + x1 + decropped[:, 1::2] = bboxes[..., 1::2] * th + y1 + + return decropped + + crop_quadruple = img_meta.get('crop_quadruple', np.array([0, 0, 1, 1])) + bboxes = _bbox_crop_undo(bboxes, crop_quadruple) + + results.bboxes = bboxes + results.scores = scores + + return results diff --git a/mmaction/models/roi_heads/roi_extractors/__init__.py b/mmaction/models/roi_heads/roi_extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cf008d90b16c7298139bf9941c050f0d302fa900 --- /dev/null +++ b/mmaction/models/roi_heads/roi_extractors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .single_straight3d import SingleRoIExtractor3D + +__all__ = ['SingleRoIExtractor3D'] diff --git a/mmaction/models/roi_heads/roi_extractors/single_straight3d.py b/mmaction/models/roi_heads/roi_extractors/single_straight3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0d86b15bcdff8ae3a36a2b4fee8f9f9b40836da6 --- /dev/null +++ b/mmaction/models/roi_heads/roi_extractors/single_straight3d.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class SingleRoIExtractor3D(nn.Module): + """Extract RoI features from a single level feature map. + + Args: + roi_layer_type (str): Specify the RoI layer type. + Defaults to ``RoIAlign``. + featmap_stride (int): Strides of input feature maps. Defaults to 16. + output_size (int or tuple): Size or (Height, Width). Defaults to 16. + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + Defaults to 0. + pool_mode (str): pooling mode in each bin. Choices are ``avg`` or + ``max``. Defaults to ``avg``. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + Defaults to True. + with_temporal_pool (bool): if True, avgpool the temporal dim. + Defaults to True. + with_global (bool): if True, concatenate the RoI feature with global + feature. Defaults to False. + + Note that sampling_ratio, pool_mode, aligned only apply when roi_layer_type + is set as RoIAlign. + """ + + def __init__(self, + roi_layer_type: str = 'RoIAlign', + featmap_stride: int = 16, + output_size: int = 16, + sampling_ratio: int = 0, + pool_mode: str = 'avg', + aligned: bool = True, + with_temporal_pool: bool = True, + temporal_pool_mode: str = 'avg', + with_global: bool = False) -> None: + super().__init__() + self.roi_layer_type = roi_layer_type + assert self.roi_layer_type in ['RoIPool', 'RoIAlign'] + self.featmap_stride = featmap_stride + self.spatial_scale = 1. 
/ self.featmap_stride + + self.output_size = output_size + self.sampling_ratio = sampling_ratio + self.pool_mode = pool_mode + self.aligned = aligned + + self.with_temporal_pool = with_temporal_pool + self.temporal_pool_mode = temporal_pool_mode + + self.with_global = with_global + + try: + from mmcv.ops import RoIAlign, RoIPool + except (ImportError, ModuleNotFoundError): + raise ImportError('Failed to import `RoIAlign` and `RoIPool` from ' + '`mmcv.ops`. The two modules will be used in ' + '`SingleRoIExtractor3D`! ') + + if self.roi_layer_type == 'RoIPool': + self.roi_layer = RoIPool(self.output_size, self.spatial_scale) + else: + self.roi_layer = RoIAlign( + self.output_size, + self.spatial_scale, + sampling_ratio=self.sampling_ratio, + pool_mode=self.pool_mode, + aligned=self.aligned) + self.global_pool = nn.AdaptiveAvgPool2d(self.output_size) + + def forward(self, feat: Union[Tensor, Tuple[Tensor]], + rois: Tensor) -> tuple: + """Forward function for extract roi features. + + Args: + feat (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. The shape of feat is N, C, T, H, W. + rois (Tensor): Input RoIs, shape (k, 5). + + Returns: + tuple: A tuple of roi features and global features. + + - roi_feats (Tensor): Extracted bbox RoI features. + - feat (Tensor): Global features of the video clip. + """ + if not isinstance(feat, tuple): + feat = (feat, ) + + if len(feat) >= 2: + maxT = max([x.shape[2] for x in feat]) + max_shape = (maxT, ) + feat[0].shape[3:] + # resize each feat to the largest shape (w. 
nearest) + feat = [F.interpolate(x, max_shape).contiguous() for x in feat] + + if self.with_temporal_pool: + if self.temporal_pool_mode == 'avg': + feat = [torch.mean(x, 2, keepdim=True) for x in feat] + elif self.temporal_pool_mode == 'max': + feat = [torch.max(x, 2, keepdim=True)[0] for x in feat] + else: + raise NotImplementedError + + feat = torch.cat(feat, axis=1).contiguous() + + roi_feats = [] + for t in range(feat.size(2)): + frame_feat = feat[:, :, t].contiguous() + roi_feat = self.roi_layer(frame_feat, rois) + if self.with_global: + global_feat = self.global_pool(frame_feat.contiguous()) + inds = rois[:, 0].type(torch.int64) + global_feat = global_feat[inds] + roi_feat = torch.cat([roi_feat, global_feat], dim=1) + roi_feat = roi_feat.contiguous() + roi_feats.append(roi_feat) + + roi_feats = torch.stack(roi_feats, dim=2) + return roi_feats, feat diff --git a/mmaction/models/roi_heads/roi_head.py b/mmaction/models/roi_heads/roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..15fc61e7aee951c614c6907feb33c300f0d80c86 --- /dev/null +++ b/mmaction/models/roi_heads/roi_head.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +from mmdet.models.roi_heads import StandardRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.structures.bbox import bbox2roi +from torch import Tensor + +from mmaction.utils import ConfigType, InstanceList, SampleList + + +class AVARoIHead(StandardRoIHead): + + def loss(self, x: Union[Tensor, + Tuple[Tensor]], rpn_results_list: InstanceList, + data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. 
+ data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + Dict[str, Tensor]: A dictionary of loss components. + """ + assert len(rpn_results_list) == len(data_samples) + batch_gt_instances = [] + for data_sample in data_samples: + batch_gt_instances.append(data_sample.gt_instances) + + # assign gts and sample proposals + num_imgs = len(data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign(rpn_results, + batch_gt_instances[i], + None) + sampling_result = self.bbox_sampler.sample(assign_result, + rpn_results, + batch_gt_instances[i]) + sampling_results.append(sampling_result) + + # LFB needs meta_info: 'img_key' + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + + losses = dict() + # bbox head forward and loss + bbox_results = self.bbox_loss(x, sampling_results, batch_img_metas) + losses.update(bbox_results['loss_bbox']) + + return losses + + def _bbox_forward(self, x: Union[Tensor, Tuple[Tensor]], rois: Tensor, + batch_img_metas: List[dict], **kwargs) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + batch_img_metas (List[dict]): List of image information. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. 
+ """ + bbox_feats, global_feat = self.bbox_roi_extractor(x, rois) + + if self.with_shared_head: + bbox_feats = self.shared_head( + bbox_feats, + feat=global_feat, + rois=rois, + img_metas=batch_img_metas) + + cls_score = self.bbox_head(bbox_feats) + + bbox_results = dict(cls_score=cls_score, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, x: Union[Tensor, Tuple[Tensor]], + sampling_results: List[SamplingResult], + batch_img_metas: List[dict], **kwargs) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + sampling_results (List[SamplingResult]): Sampling results. + batch_img_metas (List[dict]): List of image information. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois, batch_img_metas) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + return bbox_results + + def predict(self, x: Union[Tensor, + Tuple[Tensor]], rpn_results_list: InstanceList, + data_samples: SampleList, **kwargs) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (Tensor or Tuple[Tensor]): The image features extracted by + the upstream network. + rpn_results_list (List[:obj:`InstanceData`]): list of region + proposals. 
+ data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + List[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + """ + assert self.with_bbox, 'Bbox head must be implemented.' + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + if isinstance(x, tuple): + x_shape = x[0].shape + else: + x_shape = x.shape + + assert x_shape[0] == 1, 'only accept 1 sample at test mode' + assert x_shape[0] == len(batch_img_metas) == len(rpn_results_list) + + results_list = self.predict_bbox( + x, batch_img_metas, rpn_results_list, rcnn_test_cfg=self.test_cfg) + + return results_list + + def predict_bbox(self, x: Tuple[Tensor], batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following + keys: + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). 
+ """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + bbox_results = self._bbox_forward(x, rois, batch_img_metas) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg) + + return result_list diff --git a/mmaction/models/roi_heads/shared_heads/__init__.py b/mmaction/models/roi_heads/shared_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a6869196d9556b5f3bd6c1db9f3deb60ddbc5504 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .acrn_head import ACRNHead +from .fbo_head import FBOHead +from .lfb_infer_head import LFBInferHead + +__all__ = ['ACRNHead', 'LFBInferHead', 'FBOHead'] diff --git a/mmaction/models/roi_heads/shared_heads/acrn_head.py b/mmaction/models/roi_heads/shared_heads/acrn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..939a3af78eb167f49f9e494b650fd14ab2255f39 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/acrn_head.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +# Note: All these heads take 5D Tensors as input (N, C, T, H, W) + + +class ACRNHead(nn.Module): + """ACRN Head: Tile + 1x1 convolution + 3x3 convolution. + + This module is proposed in + `Actor-Centric Relation Network + `_ + + Args: + in_channels (int): The input channel. + out_channels (int): The output channel. 
+ stride (int): The spatial stride. + num_convs (int): The number of 3x3 convolutions in ACRNHead. + conv_cfg (dict): Config for norm layers. Default: dict(type='Conv'). + norm_cfg (dict): + Config for norm layers. required keys are `type` and + `requires_grad`. Default: dict(type='BN2d', requires_grad=True). + act_cfg (dict): Config for activate layers. + Default: dict(type='ReLU', inplace=True). + kwargs (dict): Other new arguments, to be compatible with MMDet update. + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + num_convs=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + **kwargs): + + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + self.num_convs = num_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.max_pool = nn.AdaptiveMaxPool3d(1) + + self.conv1 = ConvModule( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + assert num_convs >= 1 + self.conv2 = ConvModule( + out_channels, + out_channels, + kernel_size=(1, 3, 3), + stride=(1, stride, stride), + padding=(0, 1, 1), + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + convs = [] + for _ in range(num_convs - 1): + conv = ConvModule( + out_channels, + out_channels, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + convs.append(conv) + self.convs = nn.ModuleList(convs) + + def init_weights(self, **kwargs): + """Weight Initialization for ACRNHead.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + def forward(self, x, feat, rois, **kwargs): + """Defines the computation performed at every call. 
+ + Args: + x (torch.Tensor): The extracted RoI feature. + feat (torch.Tensor): The context feature. + rois (torch.Tensor): The regions of interest. + + Returns: + torch.Tensor: The RoI features that have interacted with context + feature. + """ + # We use max pooling by default + x = self.max_pool(x) + + h, w = feat.shape[-2:] + x_tile = x.repeat(1, 1, 1, h, w) + + roi_inds = rois[:, 0].type(torch.long) + roi_gfeat = feat[roi_inds] + + new_feat = torch.cat([x_tile, roi_gfeat], dim=1) + new_feat = self.conv1(new_feat) + new_feat = self.conv2(new_feat) + + for conv in self.convs: + new_feat = conv(new_feat) + + return new_feat diff --git a/mmaction/models/roi_heads/shared_heads/fbo_head.py b/mmaction/models/roi_heads/shared_heads/fbo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b6199701288a1c0462cc8dbe3d05130dfce8465c --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/fbo_head.py @@ -0,0 +1,397 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyrigho (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.logging import MMLogger +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from .lfb import LFB + + +class NonLocalLayer(nn.Module): + """Non-local layer used in `FBONonLocal` is a variation of the vanilla non- + local block. + + Args: + st_feat_channels (int): Channels of short-term features. + lt_feat_channels (int): Channels of long-term features. + latent_channels (int): Channels of latent features. + use_scale (bool): Whether to scale pairwise_weight by + `1/sqrt(latent_channels)`. Default: True. + pre_activate (bool): Whether to use the activation function before + upsampling. Default: False. + conv_cfg (Dict | None): The config dict for convolution layers. 
If + not specified, it will use `nn.Conv2d` for convolution layers. + Default: None. + norm_cfg (Dict | None): he config dict for normalization layers. + Default: None. + dropout_ratio (float, optional): Probability of dropout layer. + Default: 0.2. + zero_init_out_conv (bool): Whether to use zero initialization for + out_conv. Default: False. + """ + + def __init__(self, + st_feat_channels, + lt_feat_channels, + latent_channels, + num_st_feat, + num_lt_feat, + use_scale=True, + pre_activate=True, + pre_activate_with_ln=True, + conv_cfg=None, + norm_cfg=None, + dropout_ratio=0.2, + zero_init_out_conv=False): + super().__init__() + if conv_cfg is None: + conv_cfg = dict(type='Conv3d') + self.st_feat_channels = st_feat_channels + self.lt_feat_channels = lt_feat_channels + self.latent_channels = latent_channels + self.num_st_feat = num_st_feat + self.num_lt_feat = num_lt_feat + self.use_scale = use_scale + self.pre_activate = pre_activate + self.pre_activate_with_ln = pre_activate_with_ln + self.dropout_ratio = dropout_ratio + self.zero_init_out_conv = zero_init_out_conv + + self.st_feat_conv = ConvModule( + self.st_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.lt_feat_conv = ConvModule( + self.lt_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.global_conv = ConvModule( + self.lt_feat_channels, + self.latent_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if pre_activate: + self.ln = nn.LayerNorm([latent_channels, num_st_feat, 1, 1]) + else: + self.ln = nn.LayerNorm([st_feat_channels, num_st_feat, 1, 1]) + + self.relu = nn.ReLU() + + self.out_conv = ConvModule( + self.latent_channels, + self.st_feat_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.dropout_ratio > 0: + self.dropout = nn.Dropout(self.dropout_ratio) + + def 
init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {pretrained}') + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + if self.zero_init_out_conv: + constant_init(self.out_conv, 0, bias=0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + n, c = st_feat.size(0), self.latent_channels + num_st_feat, num_lt_feat = self.num_st_feat, self.num_lt_feat + + theta = self.st_feat_conv(st_feat) + theta = theta.view(n, c, num_st_feat) + + phi = self.lt_feat_conv(lt_feat) + phi = phi.view(n, c, num_lt_feat) + + g = self.global_conv(lt_feat) + g = g.view(n, c, num_lt_feat) + + # (n, num_st_feat, c), (n, c, num_lt_feat) + # -> (n, num_st_feat, num_lt_feat) + theta_phi = torch.matmul(theta.permute(0, 2, 1), phi) + if self.use_scale: + theta_phi /= c**0.5 + + p = theta_phi.softmax(dim=-1) + + # (n, c, num_lt_feat), (n, num_lt_feat, num_st_feat) + # -> (n, c, num_st_feat, 1, 1) + out = torch.matmul(g, p.permute(0, 2, 1)).view(n, c, num_st_feat, 1, 1) + + # If need to activate it before out_conv, use relu here, otherwise + # use relu outside the non local layer. + if self.pre_activate: + if self.pre_activate_with_ln: + out = self.ln(out) + out = self.relu(out) + + out = self.out_conv(out) + + if not self.pre_activate: + out = self.ln(out) + if self.dropout_ratio > 0: + out = self.dropout(out) + + return out + + +class FBONonLocal(nn.Module): + """Non local feature bank operator. + + Args: + st_feat_channels (int): Channels of short-term features. + lt_feat_channels (int): Channels of long-term features. 
+ latent_channels (int): Channels of latent features. + num_st_feat (int): Number of short-term roi features. + num_lt_feat (int): Number of long-term roi features. + num_non_local_layers (int): Number of non-local layers, which is + at least 1. Default: 2. + st_feat_dropout_ratio (float): Probability of dropout layer for + short-term features. Default: 0.2. + lt_feat_dropout_ratio (float): Probability of dropout layer for + long-term features. Default: 0.2. + pre_activate (bool): Whether to use the activation function before + upsampling in non local layers. Default: True. + zero_init_out_conv (bool): Whether to use zero initialization for + out_conv in NonLocalLayer. Default: False. + """ + + def __init__(self, + st_feat_channels, + lt_feat_channels, + latent_channels, + num_st_feat, + num_lt_feat, + num_non_local_layers=2, + st_feat_dropout_ratio=0.2, + lt_feat_dropout_ratio=0.2, + pre_activate=True, + zero_init_out_conv=False, + **kwargs): + super().__init__() + assert num_non_local_layers >= 1, ( + 'At least one non_local_layer is needed.') + self.st_feat_channels = st_feat_channels + self.lt_feat_channels = lt_feat_channels + self.latent_channels = latent_channels + self.num_st_feat = num_st_feat + self.num_lt_feat = num_lt_feat + self.num_non_local_layers = num_non_local_layers + self.st_feat_dropout_ratio = st_feat_dropout_ratio + self.lt_feat_dropout_ratio = lt_feat_dropout_ratio + self.pre_activate = pre_activate + self.zero_init_out_conv = zero_init_out_conv + + self.st_feat_conv = nn.Conv3d( + st_feat_channels, latent_channels, kernel_size=1) + self.lt_feat_conv = nn.Conv3d( + lt_feat_channels, latent_channels, kernel_size=1) + + if self.st_feat_dropout_ratio > 0: + self.st_feat_dropout = nn.Dropout(self.st_feat_dropout_ratio) + + if self.lt_feat_dropout_ratio > 0: + self.lt_feat_dropout = nn.Dropout(self.lt_feat_dropout_ratio) + + if not self.pre_activate: + self.relu = nn.ReLU() + + self.non_local_layers = [] + for idx in 
range(self.num_non_local_layers): + layer_name = f'non_local_layer_{idx + 1}' + self.add_module( + layer_name, + NonLocalLayer( + latent_channels, + latent_channels, + latent_channels, + num_st_feat, + num_lt_feat, + pre_activate=self.pre_activate, + zero_init_out_conv=self.zero_init_out_conv)) + self.non_local_layers.append(layer_name) + + def init_weights(self, pretrained=None): + """Initiate the parameters either from existing checkpoint or from + scratch.""" + if isinstance(pretrained, str): + logger = MMLogger.get_current_instance() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + kaiming_init(self.st_feat_conv) + kaiming_init(self.lt_feat_conv) + for layer_name in self.non_local_layers: + non_local_layer = getattr(self, layer_name) + non_local_layer.init_weights(pretrained=pretrained) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + # prepare st_feat + st_feat = self.st_feat_conv(st_feat) + if self.st_feat_dropout_ratio > 0: + st_feat = self.st_feat_dropout(st_feat) + + # prepare lt_feat + lt_feat = self.lt_feat_conv(lt_feat) + if self.lt_feat_dropout_ratio > 0: + lt_feat = self.lt_feat_dropout(lt_feat) + + # fuse short-term and long-term features in NonLocal Layer + for layer_name in self.non_local_layers: + identity = st_feat + non_local_layer = getattr(self, layer_name) + nl_out = non_local_layer(st_feat, lt_feat) + nl_out = identity + nl_out + if not self.pre_activate: + nl_out = self.relu(nl_out) + st_feat = nl_out + + return nl_out + + +class FBOAvg(nn.Module): + """Avg pool feature bank operator.""" + + def __init__(self, **kwargs): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool3d((1, None, None)) + + def init_weights(self, pretrained=None): + # FBOAvg has no parameters to be initialized. 
+ pass + + def forward(self, st_feat, lt_feat): + out = self.avg_pool(lt_feat) + return out + + +class FBOMax(nn.Module): + """Max pool feature bank operator.""" + + def __init__(self, **kwargs): + super().__init__() + self.max_pool = nn.AdaptiveMaxPool3d((1, None, None)) + + def init_weights(self, pretrained=None): + """FBOMax has no parameters to be initialized.""" + pass + + def forward(self, st_feat, lt_feat): + """Defines the computation performed at every call.""" + out = self.max_pool(lt_feat) + return out + + +class FBOHead(nn.Module): + """Feature Bank Operator Head. + + Add feature bank operator for the spatiotemporal detection model to fuse + short-term features and long-term features. + Args: + lfb_cfg (Dict): The config dict for LFB which is used to sample + long-term features. + fbo_cfg (Dict): The config dict for feature bank operator (FBO). The + type of fbo is also in the config dict and supported fbo type is + `fbo_dict`. + temporal_pool_type (str): The temporal pool type. Choices are 'avg' or + 'max'. Default: 'avg'. + spatial_pool_type (str): The spatial pool type. Choices are 'avg' or + 'max'. Default: 'max'. 
+ """ + + fbo_dict = {'non_local': FBONonLocal, 'avg': FBOAvg, 'max': FBOMax} + + def __init__(self, + lfb_cfg, + fbo_cfg, + temporal_pool_type='avg', + spatial_pool_type='max'): + super().__init__() + fbo_type = fbo_cfg.pop('type', 'non_local') + assert fbo_type in FBOHead.fbo_dict + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + + self.lfb_cfg = copy.deepcopy(lfb_cfg) + self.fbo_cfg = copy.deepcopy(fbo_cfg) + + self.lfb = LFB(**self.lfb_cfg) + self.fbo = self.fbo_dict[fbo_type](**self.fbo_cfg) + + # Pool by default + if temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + def init_weights(self, pretrained=None): + """Initialize the weights in the module. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Default: None. 
+ """ + self.fbo.init_weights(pretrained=pretrained) + + def sample_lfb(self, rois, img_metas): + """Sample long-term features for each ROI feature.""" + inds = rois[:, 0].type(torch.int64) + lt_feat_list = [] + for ind in inds: + lt_feat_list.append(self.lfb[img_metas[ind]['img_key']]) + lt_feat = torch.stack(lt_feat_list, dim=0) + # [N, lfb_channels, window_size * max_num_feat_per_step] + lt_feat = lt_feat.permute(0, 2, 1).contiguous() + return lt_feat.unsqueeze(-1).unsqueeze(-1) + + def forward(self, x, rois, img_metas, **kwargs): + """Defines the computation performed at every call.""" + # [N, C, 1, 1, 1] + st_feat = self.temporal_pool(x) + st_feat = self.spatial_pool(st_feat) + identity = st_feat + + # [N, C, window_size * num_feat_per_step, 1, 1] + lt_feat = self.sample_lfb(rois, img_metas).to(st_feat.device) + + fbo_feat = self.fbo(st_feat, lt_feat) + + out = torch.cat([identity, fbo_feat], dim=1) + return out diff --git a/mmaction/models/roi_heads/shared_heads/lfb.py b/mmaction/models/roi_heads/shared_heads/lfb.py new file mode 100644 index 0000000000000000000000000000000000000000..1fcb406f91fd1e138ad343f2733349ca3a2eb5c7 --- /dev/null +++ b/mmaction/models/roi_heads/shared_heads/lfb.py @@ -0,0 +1,194 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os +import os.path as osp +import warnings + +import torch +import torch.distributed as dist +from mmengine.dist import get_dist_info + +try: + import lmdb + lmdb_imported = True +except (ImportError, ModuleNotFoundError): + lmdb_imported = False + + +class LFB: + """Long-Term Feature Bank (LFB). LFB is proposed in `Long-Term Feature + Banks for Detailed Video Understanding `_ + The ROI features of videos are stored in the feature bank. The feature bank + was generated by inferring with a lfb infer config. Formally, LFB is a Dict + whose keys are video IDs and its values are also Dicts whose keys are + timestamps in seconds. Example of LFB: + + .. 
class LFB:
    """Long-Term Feature Bank (LFB).

    LFB is proposed in `Long-Term Feature Banks for Detailed Video
    Understanding`. The ROI features of videos are stored in the feature
    bank, which was generated by inferring with a lfb infer config.
    Formally, LFB is a Dict whose keys are video IDs and its values are
    also Dicts whose keys are timestamps in seconds, each mapping to a
    ``[num_feat, lfb_channels]`` feature tensor.

    Args:
        lfb_prefix_path (str): The storage path of lfb.
        max_num_sampled_feat (int): The max number of sampled features
            per second. Default: 5.
        window_size (int): Window size (in seconds) of sampling long term
            feature. Default: 60.
        lfb_channels (int): Number of the channels of the features stored
            in LFB. Default: 2048.
        dataset_modes (tuple[str] | str): Load LFB of datasets with
            different modes, such as training, validation, testing
            datasets. Default: ('train', 'val').
        device (str): Where to load lfb. Choices are 'gpu', 'cpu' and
            'lmdb'. Default: 'gpu'.
        lmdb_map_size (int): Map size of lmdb. Default: 4e9.
        construct_lmdb (bool): Whether to construct lmdb. If you have
            constructed lmdb of lfb, you can set to False to skip the
            construction. Default: True.
    """

    def __init__(self,
                 lfb_prefix_path,
                 max_num_sampled_feat=5,
                 window_size=60,
                 lfb_channels=2048,
                 dataset_modes=('train', 'val'),
                 device='gpu',
                 lmdb_map_size=4e9,
                 construct_lmdb=True):
        if not osp.exists(lfb_prefix_path):
            raise ValueError(
                f'lfb prefix path {lfb_prefix_path} does not exist!')
        self.lfb_prefix_path = lfb_prefix_path
        self.max_num_sampled_feat = max_num_sampled_feat
        self.window_size = window_size
        self.lfb_channels = lfb_channels
        if not isinstance(dataset_modes, tuple):
            assert isinstance(dataset_modes, str)
            dataset_modes = (dataset_modes, )
        self.dataset_modes = dataset_modes
        self.device = device

        rank, world_size = get_dist_info()
        # Loading LFB
        if self.device == 'gpu':
            if 'LOCAL_RANK' in os.environ:
                local_rank = int(os.environ['LOCAL_RANK'])
            else:
                # Fall back to deriving the local rank from the global one.
                gpus_per_node = torch.cuda.device_count()
                local_rank = rank % gpus_per_node

            self.load_lfb(f'cuda:{local_rank}')
        elif self.device == 'cpu':
            if world_size > 1:
                warnings.warn(
                    'If distributed training is used with multi-GPUs, lfb '
                    'will be loaded multiple times on RAM. In this case, '
                    "'lmdb' is recommended.", UserWarning)
            self.load_lfb('cpu')
        elif self.device == 'lmdb':
            assert lmdb_imported, (
                'Please install `lmdb` to load lfb on lmdb!')
            self.lmdb_map_size = lmdb_map_size
            self.construct_lmdb = construct_lmdb
            self.lfb_lmdb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, 'lmdb'))

            # Only rank 0 writes the lmdb; other ranks just open it.
            if rank == 0 and self.construct_lmdb:
                print('Constructing LFB lmdb...')
                self.load_lfb_on_lmdb()

            # Synchronizes all processes to make sure lfb lmdb exist.
            if world_size > 1:
                dist.barrier()
            self.lmdb_env = lmdb.open(self.lfb_lmdb_path, readonly=True)
        else:
            # Single f-string message (previously two arguments were
            # passed to ValueError, producing a tuple-shaped message).
            raise ValueError("Device must be 'gpu', 'cpu' or 'lmdb', "
                             f'but got {self.device}.')

    def load_lfb(self, map_location):
        """Load LFB pickle files onto ``map_location`` ('cpu'/'cuda:N')."""
        self.lfb = {}
        for dataset_mode in self.dataset_modes:
            lfb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl'))
            print(f'Loading LFB from {lfb_path}...')
            self.lfb.update(torch.load(lfb_path, map_location=map_location))

        # Features stored as lists/tuples are stacked into single tensors
        # so indexing by second returns a [num_feat, C] tensor.
        for video_id in self.lfb:
            video_features = self.lfb[video_id]
            for sec in video_features:
                if isinstance(video_features[sec], (list, tuple)):
                    video_features[sec] = torch.stack(video_features[sec])
            self.lfb[video_id] = video_features
        print(f'LFB has been loaded on {map_location}.')

    def load_lfb_on_lmdb(self):
        """Serialize the pickle LFB into an lmdb keyed by video ID."""
        lfb = {}
        for dataset_mode in self.dataset_modes:
            lfb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl'))
            lfb.update(torch.load(lfb_path, map_location='cpu'))

        lmdb_env = lmdb.open(self.lfb_lmdb_path, map_size=self.lmdb_map_size)
        for key, value in lfb.items():
            txn = lmdb_env.begin(write=True)
            buff = io.BytesIO()
            torch.save(value, buff)
            buff.seek(0)
            txn.put(key.encode(), buff.read())
            txn.commit()
            buff.close()

        print(f'LFB lmdb has been constructed on {self.lfb_lmdb_path}!')

    def sample_long_term_features(self, video_id, timestamp):
        """Sample features in a window centered at ``timestamp``.

        Returns:
            torch.Tensor: Zero-padded features of shape
            ``[window_size * max_num_sampled_feat, lfb_channels]``.
        """
        if self.device == 'lmdb':
            with self.lmdb_env.begin(write=False) as txn:
                buf = txn.get(video_id.encode())
                video_features = torch.load(io.BytesIO(buf))
        else:
            video_features = self.lfb[video_id]

        # Sample long term features.
        window_size, K = self.window_size, self.max_num_sampled_feat
        start = timestamp - (window_size // 2)
        lt_feats = torch.zeros(window_size, K, self.lfb_channels)

        for idx, sec in enumerate(range(start, start + window_size)):
            if sec in video_features:
                # `num_feat` is the number of roi features in this second.
                feat = video_features[sec]
                num_feat = feat.shape[0]

                # Sample at most K roi features randomly. Clamp the
                # destination slice to the number of sampled rows;
                # previously the slice was `[:num_feat]`, which crashed
                # with a shape mismatch whenever num_feat > K.
                num_sampled = min(num_feat, K)
                random_lfb_indices = torch.randperm(num_feat)[:num_sampled]
                lt_feats[idx, :num_sampled] = feat[random_lfb_indices]

        # [window_size * max_num_sampled_feat, lfb_channels]
        return lt_feats.reshape(-1, self.lfb_channels)

    def __getitem__(self, img_key):
        """Sample long term features like `lfb['0f39OWEqJ24,0902']` where
        `lfb` is a instance of class LFB."""
        video_id, timestamp = img_key.split(',')
        return self.sample_long_term_features(video_id, int(timestamp))

    def __len__(self):
        """The number of videos whose ROI features are stored in LFB."""
        return len(self.lfb)
+ """ + + def __init__(self, + lfb_prefix_path, + dataset_mode='train', + use_half_precision=True, + temporal_pool_type='avg', + spatial_pool_type='max'): + super().__init__() + rank, _ = mmengine.dist.get_dist_info() + if rank == 0: + if not osp.exists(lfb_prefix_path): + print(f'lfb prefix path {lfb_prefix_path} does not exist. ' + f'Creating the folder...') + mmengine.mkdir_or_exist(lfb_prefix_path) + print('\nInferring LFB...') + + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.lfb_prefix_path = lfb_prefix_path + self.dataset_mode = dataset_mode + self.use_half_precision = use_half_precision + + # Pool by default + if temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3d((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3d((1, None, None)) + if spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3d((None, 1, 1)) + + self.all_features = [] + self.all_metadata = [] + + def init_weights(self, pretrained=None): + """LFBInferHead has no parameters to be initialized.""" + pass + + def forward(self, x, rois, img_metas, **kwargs): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The extracted RoI feature. + rois (torch.Tensor): The regions of interest. + img_metas (List[dict]): The meta information of the data. + + Returns: + torch.Tensor: The RoI features that have interacted with context + """ + # [N, C, 1, 1, 1] + features = self.temporal_pool(x) + features = self.spatial_pool(features) + if self.use_half_precision: + features = features.half() + + inds = rois[:, 0].type(torch.int64) + for ind in inds: + self.all_metadata.append(img_metas[ind]['img_key']) + self.all_features += list(features) + + # Return the input directly and doesn't affect the input. 
+ return x + + def __del__(self): + assert len(self.all_features) == len(self.all_metadata), ( + 'features and metadata are not equal in length!') + + rank, world_size = mmengine.dist.get_dist_info() + if world_size > 1: + dist.barrier() + + _lfb = {} + for feature, metadata in zip(self.all_features, self.all_metadata): + video_id, timestamp = metadata.split(',') + timestamp = int(timestamp) + + if video_id not in _lfb: + _lfb[video_id] = {} + if timestamp not in _lfb[video_id]: + _lfb[video_id][timestamp] = [] + + _lfb[video_id][timestamp].append(torch.squeeze(feature)) + + _lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, + f'_lfb_{self.dataset_mode}_{rank}.pkl')) + torch.save(_lfb, _lfb_file_path) + print(f'{len(self.all_features)} features from {len(_lfb)} videos ' + f'on GPU {rank} have been stored in {_lfb_file_path}.') + + # Synchronizes all processes to make sure all gpus have stored their + # roi features + if world_size > 1: + dist.barrier() + if rank > 0: + return + + print('Gathering all the roi features...') + + lfb = {} + for rank_id in range(world_size): + _lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, + f'_lfb_{self.dataset_mode}_{rank_id}.pkl')) + + # Since each frame will only be distributed to one GPU, + # the roi features on the same timestamp of the same video are all + # on the same GPU + _lfb = torch.load(_lfb_file_path) + for video_id in _lfb: + if video_id not in lfb: + lfb[video_id] = _lfb[video_id] + else: + lfb[video_id].update(_lfb[video_id]) + + osp.os.remove(_lfb_file_path) + + lfb_file_path = osp.normpath( + osp.join(self.lfb_prefix_path, f'lfb_{self.dataset_mode}.pkl')) + torch.save(lfb, lfb_file_path) + print(f'LFB has been constructed in {lfb_file_path}!') diff --git a/mmaction/models/similarity/__init__.py b/mmaction/models/similarity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2c15d0fd8130d01d0258b34a396463f336e7be49 --- /dev/null +++ 
class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run LayerNorm in fp32 and cast the result back to the input
        dtype."""
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


class QuickGELU(nn.Module):
    """Quick GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Perform quick gelu."""
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    """ResidualAttentionBlock.

    Pre-norm transformer block: self-attention and an MLP, each with a
    residual connection.

    Args:
        d_model (int): The dimension of the model.
        n_head (int): The number of heads.
        attn_mask (torch.Tensor, optional): The attention mask.
            Defaults to None.
    """

    def __init__(self,
                 d_model: int,
                 n_head: int,
                 attn_mask: Optional[torch.Tensor] = None) -> None:
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
                         ('gelu', QuickGELU()),
                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor) -> torch.Tensor:
        """Perform attention."""
        # Move the mask to the input's dtype/device via a local variable;
        # the original reassigned ``self.attn_mask`` on every call,
        # mutating module state inside the forward pass.
        attn_mask = self.attn_mask
        if attn_mask is not None:
            attn_mask = attn_mask.to(dtype=x.dtype, device=x.device)
        return self.attn(
            x, x, x, need_weights=False, attn_mask=attn_mask)[0]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    """Transformer.

    A stack of ``layers`` ResidualAttentionBlocks.

    Args:
        width (int): The width of transformer.
        layers (int): The number of layers of transformer.
        heads (int): The number of heads of transformer.
        attn_mask (torch.Tensor, optional): The attention mask.
            Defaults to None.
    """

    def __init__(self,
                 width: int,
                 layers: int,
                 heads: int,
                 attn_mask: Optional[torch.Tensor] = None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[
            ResidualAttentionBlock(width, heads, attn_mask)
            for _ in range(layers)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call."""
        return self.resblocks(x)
@MODELS.register_module()
class TransformerAdapter(BaseModule):
    """Transformer adapter, modified from github.com/openai/CLIP.

    Aggregates per-frame features with a temporal transformer plus a
    residual connection, then averages over frames.

    Args:
        num_segs (int): The number of segments.
        transformer_width (int): The width of transformer.
        transformer_heads (int): The number of heads of transformer.
        transformer_layers (int): The number of layers of transformer.
    """

    def __init__(self, num_segs: int, transformer_width: int,
                 transformer_heads: int, transformer_layers: int) -> None:
        super(TransformerAdapter, self).__init__()
        self.num_segs = num_segs

        # Learnable temporal position embedding, one row per segment.
        self.positional_embedding = nn.Parameter(
            torch.empty(num_segs, transformer_width))
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads)

    def init_weights(self) -> None:
        """Initialize the weights (CLIP-style scaled normal init)."""

        nn.init.normal_(self.positional_embedding, std=0.01)

        proj_std = (self.transformer.width**-0.5) * (
            (2 * self.transformer.layers)**-0.5)
        attn_std = self.transformer.width**-0.5
        fc_std = (2 * self.transformer.width)**-0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): Frame features of shape ``[B, num_segs, C]``.

        Returns:
            torch.Tensor: Video features of shape ``[B, C]``.
        """
        x_original = x
        x = x + self.positional_embedding
        x = x.transpose(0, 1)  # NLD -> LND
        x = self.transformer(x)
        x = x.transpose(0, 1)  # LND -> NLD
        # Residual connection in the original dtype (the transformer may
        # run in a different precision).
        x = x.type(x_original.dtype) + x_original
        return x.mean(dim=1)


@MODELS.register_module()
class SimpleMeanAdapter(BaseModule):
    """Average features adapter.

    Args:
        dim (int | Tuple[int]): The dimension(s) to perform averaging.
            Defaults to 1.
    """

    def __init__(self, dim: Union[int, Tuple[int]] = 1) -> None:
        super().__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Average the features over ``self.dim``."""
        return x.mean(dim=self.dim)
@MODELS.register_module()
class CLIPSimilarity(BaseModel):
    """CLIP-based similarity model.

    Args:
        clip_arch (str): The architecture of the clip model.
            Supported choices are `'ViT-B/32'`, `'ViT-B/16'`,
            `'ViT-L/14'` and `'ViT-L/14@336px'`.
        data_preprocessor (dict): The pre-process config.
        adapter (dict): The 3D adapter config.
        to_float32 (bool): Whether to convert the dtype of params of clip
            model to float32.
        frozen_layers: Layers to be frozen (all params fixed). -1 means
            not freezing any parameters. Defaults to -1.
        loss (dict): The config of loss. Defaults to
            `dict(type='CrossEntropyLoss', loss_weight=0.5)`.
    """

    def __init__(
        self,
        clip_arch: str,
        data_preprocessor: Dict[str, Dict],
        adapter: Dict,
        to_float32: bool = False,
        frozen_layers: int = -1,
        loss: Dict = dict(type='CrossEntropyLoss', loss_weight=0.5)
    ) -> None:
        super(CLIPSimilarity,
              self).__init__(data_preprocessor=data_preprocessor)

        try:
            import clip
        except ImportError:
            raise ImportError('Please run `pip install '
                              'git+https://github.com/openai/CLIP.git` '
                              'to install clip first. ')

        self.clip = clip.load(clip_arch, device='cpu')[0]
        if to_float32:
            self.clip.float()
        # ``loss`` has a mutable (dict) default argument shared across
        # calls; copy it before building so the registry can never
        # mutate the default in place.
        self.loss = MODELS.build(dict(loss))
        self.adapter = MODELS.build(adapter)
        self.frozen_layers = frozen_layers
        self._freeze_stages()

    def encode_video(self, video: torch.Tensor) -> torch.Tensor:
        """Encode video frames and aggregate them with the adapter."""
        b, n, c, h, w = video.shape
        video = video.view(-1, c, h, w)
        frames_features = self.encode_image(video)
        frames_features = frames_features.view(b, n, -1)
        video_features = self.adapter(frames_features)
        return video_features

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        """Encode image."""
        return self.clip.encode_image(image)

    def encode_text(self, text: torch.Tensor) -> torch.Tensor:
        """Encode text."""
        return self.clip.encode_text(text)

    def extract_feat(self,
                     inputs: Dict[str, torch.Tensor],
                     norm: bool = True) -> Tuple:
        """Extract (video, text) features, L2-normalized when ``norm``."""
        text_inputs = inputs['text']
        video_inputs = inputs['imgs']
        text_features = self.encode_text(text_inputs)
        video_features = self.encode_video(video_inputs)

        if norm:
            text_features = text_features / text_features.norm(
                dim=-1, keepdim=True)
            video_features = video_features / video_features.norm(
                dim=-1, keepdim=True)

        return video_features, text_features

    def forward(self,
                inputs: Dict[str, torch.Tensor],
                data_samples: OptSampleList = None,
                mode: str = 'tensor') -> ForwardResults:
        """Forward function.

        ``mode`` selects raw features ('tensor'), the symmetric
        video<->text contrastive losses ('loss'), or per-sample feature
        predictions ('predict').
        """

        if mode == 'tensor':
            return self.extract_feat(inputs, norm=False)

        elif mode == 'loss':
            video_features, text_features = self.extract_feat(inputs)
            # Gather across ranks (with gradients) so the contrastive
            # loss sees the global batch.
            video_features = torch.cat(
                GatherLayer.apply(video_features), dim=0)
            text_features = torch.cat(GatherLayer.apply(text_features), dim=0)

            logit_scale = self.clip.logit_scale.exp()
            logits_per_video = logit_scale * video_features @ text_features.t()
            logits_per_text = logits_per_video.t()

            # The matched pair sits on the diagonal.
            labels = torch.arange(logits_per_video.shape[0]).to(
                logit_scale.device)

            sim_loss_v2t = self.loss(logits_per_video, labels)
            sim_loss_t2v = self.loss(logits_per_text, labels)

            losses = dict()
            losses['sim_loss_v2t'] = sim_loss_v2t
            losses['sim_loss_t2v'] = sim_loss_t2v
            return losses

        elif mode == 'predict':
            video_features, text_features = self.extract_feat(inputs)
            for ds, vf, tf in zip(data_samples, video_features,
                                  text_features):
                features = InstanceData(video_feature=vf, text_feature=tf)
                ds.features = features
            return data_samples

        else:
            raise RuntimeError(f'Invalid mode "{mode}". '
                               'Only supports loss, predict and tensor mode')

    def train(self, mode: bool = True) -> None:
        """Set the optimization status when training."""
        super().train(mode)
        self._freeze_stages()

    def _freeze_stages(self) -> None:
        """Prevent all the parameters from being optimized before
        ``self.frozen_layers``."""

        if self.frozen_layers >= 0:
            # Always-trainable heads sit above the frozen trunk.
            top_layers = ('ln_final', 'text_projection', 'logit_scale',
                          'visual.ln_post', 'visual.proj')
            mid_layers = ('visual.transformer.resblocks',
                          'transformer.resblocks')

            for name, param in self.clip.named_parameters():
                # str.startswith with a tuple replaces the original
                # ``any(name.find(n) == 0 ...)`` chains.
                if name.startswith(top_layers):
                    continue
                if name.startswith(mid_layers):
                    layer_n = int(name.split('.resblocks.')[1].split('.')[0])
                    if layer_n >= self.frozen_layers:
                        continue
                param.requires_grad = False
class MaxIoUAssignerAVA(MaxIoUAssigner):
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposals will be assigned with `-1`, `0`, or a positive integer
    indicating the ground truth index.

    - -1: don't care
    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        pos_iou_thr (float): IoU threshold for positive bboxes.
        neg_iou_thr (float | tuple): IoU threshold for negative bboxes.
        min_pos_iou (float): Minimum iou for a bbox to be considered as a
            positive bbox. Positive samples can have smaller IoU than
            pos_iou_thr due to the 4th step (assign max IoU sample to each
            gt). Defaults to 0.
        gt_max_assign_all (bool): Whether to assign all bboxes with the
            same highest overlap with some gt to that gt. Defaults to True.
    """

    # The function is overridden, to handle the case that gt_label is not
    # int
    def assign_wrt_overlaps(self, overlaps: Tensor,
                            gt_labels: Tensor) -> AssignResult:
        """Assign w.r.t. the overlaps of bboxes with gts.

        Args:
            overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
                shape(k, n).
            gt_labels (Tensor): Labels of k gt_bboxes, shape
                (k, num_classes).

        Returns:
            :obj:`AssignResult`: The assign result.
        """
        num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)

        # 1. assign -1 by default
        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)

        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            max_overlaps = overlaps.new_zeros((num_bboxes, ))
            assigned_labels = overlaps.new_full((num_bboxes, ),
                                                -1,
                                                dtype=torch.long)
            if num_gts == 0:
                # No truth, assign everything to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts=num_gts,
                gt_inds=assigned_gt_inds,
                max_overlaps=max_overlaps,
                labels=assigned_labels)

        # for each anchor, which gt best overlaps with it
        # for each anchor, the max iou of all gts
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
        # for each gt, which anchor best overlaps with it
        # for each gt, the max iou of all proposals
        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)

        # 2. assign negative: below
        # the negative inds are set to be 0
        if isinstance(self.neg_iou_thr, float):
            assigned_gt_inds[(max_overlaps >= 0)
                             & (max_overlaps < self.neg_iou_thr)] = 0
        elif isinstance(self.neg_iou_thr, tuple):
            assert len(self.neg_iou_thr) == 2
            assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
                             & (max_overlaps < self.neg_iou_thr[1])] = 0

        # 3. assign positive: above positive IoU threshold
        pos_inds = max_overlaps >= self.pos_iou_thr
        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1

        if self.match_low_quality:
            # Low-quality matching will overwrite the assigned_gt_inds
            # assigned in Step 3. Thus, the assigned gt might not be the
            # best one for prediction.
            # For example, if bbox A has 0.9 and 0.8 iou with GT bbox
            # 1 & 2, bbox 1 will be assigned as the best target for bbox A
            # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A,
            # bbox A's assigned_gt_inds will be overwritten to be bbox 2.
            # This might be the reason that it is not used in ROI Heads.
            for i in range(num_gts):
                if gt_max_overlaps[i] >= self.min_pos_iou:
                    if self.gt_max_assign_all:
                        max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
                        assigned_gt_inds[max_iou_inds] = i + 1
                    else:
                        assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1

        # consider multi-class case (AVA)
        assert len(gt_labels[0]) > 1
        assigned_labels = assigned_gt_inds.new_zeros(
            (num_bboxes, len(gt_labels[0])), dtype=torch.float32)

        # If not assigned, labels will be all 0.
        # squeeze(1) (instead of a bare squeeze()) keeps the index 1-D
        # even when there is exactly one positive box; squeeze() would
        # collapse a (1, 1) result to a 0-d tensor.
        pos_inds = torch.nonzero(
            assigned_gt_inds > 0, as_tuple=False).squeeze(1)
        if pos_inds.numel() > 0:
            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
                                                  1]

        return AssignResult(
            num_gts=num_gts,
            gt_inds=assigned_gt_inds,
            max_overlaps=max_overlaps,
            labels=assigned_labels)
+from .blending_utils import (BaseMiniBatchBlending, CutmixBlending, + MixupBlending, RandomBatchAugment) +from .gcn_utils import * # noqa: F401,F403 +from .graph import Graph + +__all__ = [ + 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph', + 'RandomBatchAugment' +] diff --git a/mmaction/models/utils/__pycache__/__init__.cpython-312.pyc b/mmaction/models/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ee61235f7fbf8e466e86e69feff10923ddbd9f3 Binary files /dev/null and b/mmaction/models/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/models/utils/__pycache__/blending_utils.cpython-312.pyc b/mmaction/models/utils/__pycache__/blending_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7644130e42f04f21ae265975350e1f0852eeafa4 Binary files /dev/null and b/mmaction/models/utils/__pycache__/blending_utils.cpython-312.pyc differ diff --git a/mmaction/models/utils/__pycache__/embed.cpython-312.pyc b/mmaction/models/utils/__pycache__/embed.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c827b327c1acff0eea08db58c1c82f37ecdaf709 Binary files /dev/null and b/mmaction/models/utils/__pycache__/embed.cpython-312.pyc differ diff --git a/mmaction/models/utils/__pycache__/gcn_utils.cpython-312.pyc b/mmaction/models/utils/__pycache__/gcn_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed21f003fd1a2afa68891b8456badd6920548c2c Binary files /dev/null and b/mmaction/models/utils/__pycache__/gcn_utils.cpython-312.pyc differ diff --git a/mmaction/models/utils/__pycache__/graph.cpython-312.pyc b/mmaction/models/utils/__pycache__/graph.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19e433dbda2643244009aa3badc93a3398bf9bdd Binary files /dev/null and b/mmaction/models/utils/__pycache__/graph.cpython-312.pyc differ diff --git 
a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ae9afb6665f38409a9304d82acae6c7ae73ca8 --- /dev/null +++ b/mmaction/models/utils/blending_utils.py @@ -0,0 +1,263 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from functools import partial +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.utils import digit_version +from torch.distributions.beta import Beta + +from mmaction.registry import MODELS +from mmaction.utils import SampleList + +if digit_version(torch.__version__) < digit_version('1.8.0'): + floor_div = torch.floor_divide +else: + floor_div = partial(torch.div, rounding_mode='floor') + +__all__ = ['BaseMiniBatchBlending', 'MixupBlending', 'CutmixBlending'] + + +class BaseMiniBatchBlending(metaclass=ABCMeta): + """Base class for Image Aliasing. + + Args: + num_classes (int): Number of classes. + """ + + def __init__(self, num_classes: int) -> None: + self.num_classes = num_classes + + @abstractmethod + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images process.""" + raise NotImplementedError + + def __call__(self, imgs: torch.Tensor, batch_data_samples: SampleList, + **kwargs) -> Tuple: + """Blending data in a mini-batch. + + Images are float tensors with the shape of (B, N, C, H, W) for 2D + recognizers or (B, N, C, T, H, W) for 3D recognizers. + + Besides, labels are converted from hard labels to soft labels. + Hard labels are integer tensors with the shape of (B, ) and all of the + elements are in the range [0, num_classes - 1]. + Soft labels (probability distribution over classes) are float tensors + with the shape of (B, num_classes) and all of the elements are in + the range [0, 1]. 
+ + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). + batch_data_samples (List[:obj:`ActionDataSample`]): The batch + data samples. It usually includes information such + as `gt_label`. + + Returns: + mixed_imgs (torch.Tensor): Blending images, float tensor with the + same shape of the input imgs. + batch_data_samples (List[:obj:`ActionDataSample`]): The modified + batch data samples. ``gt_label`` in each data sample are + converted from a hard label to a blended soft label, float + tensor with the shape of (num_classes, ) and all elements are + in range [0, 1]. + """ + label = [x.gt_label for x in batch_data_samples] + # single-label classification + if label[0].size(0) == 1: + label = torch.tensor(label, dtype=torch.long).to(imgs.device) + one_hot_label = F.one_hot(label, num_classes=self.num_classes) + # multi-label classification + else: + one_hot_label = torch.stack(label) + + mixed_imgs, mixed_label = self.do_blending(imgs, one_hot_label, + **kwargs) + + for label_item, sample in zip(mixed_label, batch_data_samples): + sample.set_gt_label(label_item) + + return mixed_imgs, batch_data_samples + + +@MODELS.register_module() +class MixupBlending(BaseMiniBatchBlending): + """Implementing Mixup in a mini-batch. + + This module is proposed in `mixup: Beyond Empirical Risk Minimization + `_. + Code Reference https://github.com/open-mmlab/mmclassification/blob/master/mmcls/models/utils/mixup.py # noqa + + Args: + num_classes (int): The number of classes. + alpha (float): Parameters for Beta distribution. + """ + + def __init__(self, num_classes: int, alpha: float = .2) -> None: + super().__init__(num_classes=num_classes) + self.beta = Beta(alpha, alpha) + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images with mixup. + + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). 
+ label (torch.Tensor): One hot labels, integer tensor with the shape + of (B, num_classes). + + Returns: + tuple: A tuple of blended images and labels. + """ + assert len(kwargs) == 0, f'unexpected kwargs for mixup {kwargs}' + + lam = self.beta.sample() + batch_size = imgs.size(0) + rand_index = torch.randperm(batch_size) + + mixed_imgs = lam * imgs + (1 - lam) * imgs[rand_index, :] + mixed_label = lam * label + (1 - lam) * label[rand_index, :] + + return mixed_imgs, mixed_label + + +@MODELS.register_module() +class CutmixBlending(BaseMiniBatchBlending): + """Implementing Cutmix in a mini-batch. + + This module is proposed in `CutMix: Regularization Strategy to Train Strong + Classifiers with Localizable Features `_. + Code Reference https://github.com/clovaai/CutMix-PyTorch + + Args: + num_classes (int): The number of classes. + alpha (float): Parameters for Beta distribution. + """ + + def __init__(self, num_classes: int, alpha: float = .2) -> None: + super().__init__(num_classes=num_classes) + self.beta = Beta(alpha, alpha) + + @staticmethod + def rand_bbox(img_size: torch.Size, lam: torch.Tensor) -> Tuple: + """Generate a random boudning box.""" + w = img_size[-1] + h = img_size[-2] + cut_rat = torch.sqrt(1. - lam) + cut_w = torch.tensor(int(w * cut_rat)) + cut_h = torch.tensor(int(h * cut_rat)) + + # uniform + cx = torch.randint(w, (1, ))[0] + cy = torch.randint(h, (1, ))[0] + + bbx1 = torch.clamp(cx - floor_div(cut_w, 2), 0, w) + bby1 = torch.clamp(cy - floor_div(cut_h, 2), 0, h) + bbx2 = torch.clamp(cx + floor_div(cut_w, 2), 0, w) + bby2 = torch.clamp(cy + floor_div(cut_h, 2), 0, h) + + return bbx1, bby1, bbx2, bby2 + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Blending images with cutmix. + + Args: + imgs (torch.Tensor): Model input images, float tensor with the + shape of (B, N, C, H, W) or (B, N, C, T, H, W). + label (torch.Tensor): One hot labels, integer tensor with the shape + of (B, num_classes). 
+ + Returns: + tuple: A tuple of blended images and labels. + """ + + assert len(kwargs) == 0, f'unexpected kwargs for cutmix {kwargs}' + + batch_size = imgs.size(0) + rand_index = torch.randperm(batch_size) + lam = self.beta.sample() + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.size(), lam) + imgs[:, ..., bby1:bby2, bbx1:bbx2] = imgs[rand_index, ..., bby1:bby2, + bbx1:bbx2] + lam = 1 - (1.0 * (bbx2 - bbx1) * (bby2 - bby1) / + (imgs.size()[-1] * imgs.size()[-2])) + + label = lam * label + (1 - lam) * label[rand_index, :] + + return imgs, label + + +@MODELS.register_module() +class RandomBatchAugment(BaseMiniBatchBlending): + """Randomly choose one batch augmentation to apply. + + Args: + augments (dict | list): configs of batch + augmentations. + probs (float | List[float] | None): The probabilities of each batch + augmentations. If None, choose evenly. Defaults to None. + + Example: + >>> augments_cfg = [ + ... dict(type='CutmixBlending', alpha=1., num_classes=10), + ... dict(type='MixupBlending', alpha=1., num_classes=10) + ... ] + >>> batch_augment = RandomBatchAugment(augments_cfg, probs=[0.5, 0.3]) + >>> imgs = torch.randn(16, 3, 8, 32, 32) + >>> label = torch.randint(0, 10, (16, )) + >>> imgs, label = batch_augment(imgs, label) + + .. note :: + + To decide which batch augmentation will be used, it picks one of + ``augments`` based on the probabilities. In the example above, the + probability to use CutmixBlending is 0.5, to use MixupBlending is 0.3, + and to do nothing is 0.2. + """ + + def __init__(self, + augments: Union[dict, list], + probs: Optional[Union[float, List[float]]] = None) -> None: + if not isinstance(augments, (tuple, list)): + augments = [augments] + + self.augments = [] + for aug in augments: + assert isinstance(aug, dict), \ + f'blending augment config must be a dict. 
Got {type(aug)}' + self.augments.append(MODELS.build(aug)) + + self.num_classes = augments[0].get('num_classes') + + if isinstance(probs, float): + probs = [probs] + + if probs is not None: + assert len(augments) == len(probs), \ + '``augments`` and ``probs`` must have same lengths. ' \ + f'Got {len(augments)} vs {len(probs)}.' + assert sum(probs) <= 1, \ + 'The total probability of batch augments exceeds 1.' + self.augments.append(None) + probs.append(1 - sum(probs)) + + self.probs = probs + + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: + """Randomly apply batch augmentations to the batch inputs and batch + data samples.""" + aug_index = np.random.choice(len(self.augments), p=self.probs) + aug = self.augments[aug_index] + + if aug is not None: + return aug.do_blending(imgs, label, **kwargs) + else: + return imgs, label diff --git a/mmaction/models/utils/embed.py b/mmaction/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..9e8aa44fb58e7520e9293a7d338503f151cc5eef --- /dev/null +++ b/mmaction/models/utils/embed.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils import to_3tuple + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. 
+ padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_t, input_h, input_w = input_shape + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.stride + output_d = math.ceil(input_t / stride_d) + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_d = max((output_d - 1) * stride_d + + (kernel_d - 1) * self.dilation[0] + 1 - input_t, 0) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[1] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[2] + 1 - input_w, 0) + return pad_d, pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). 
+ + Returns: + Tensor: The tensor with adaptive padding + """ + pad_d, pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_d > 0 or pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h, 0, pad_d]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, + pad_w - pad_w // 2, + pad_h // 2, + pad_h - pad_h // 2, + pad_d // 2, + pad_d - pad_d // 2, + ]) + return x + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv3d". + kernel_size (int): The kernel_size of embedding conv. + Default: (2, 4, 4). + stride (int): The slide stride of embedding conv. + Default: (2, 4, 4). + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv3d', + kernel_size=(2, 4, 4), + stride=(2, 4, 4), + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_3tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_3tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. 
when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_d, pad_h, pad_w = self.adaptive_padding.get_pad_shape( + input_size) + input_t, input_h, input_w = input_size + input_t = input_t + pad_d + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_t, input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv3d.html + t_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + h_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + w_out = (input_size[2] + 2 * padding[2] - dilation[2] * + (kernel_size[2] - 1) - 1) // stride[2] + 1 + self.init_out_size = (t_out, h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, T, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_t * out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_t, out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3], x.shape[4]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size diff --git a/mmaction/models/utils/gcn_utils.py b/mmaction/models/utils/gcn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5c423a3e3def5f417e8c5f403df15fe9901e8227 --- /dev/null +++ b/mmaction/models/utils/gcn_utils.py @@ -0,0 +1,421 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy as cp +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential + + +class unit_gcn(BaseModule): + """The basic unit of graph convolutional network. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_nodes, num_nodes)`. + adaptive (str): The strategy for adapting the weights of the + adjacency matrix. Defaults to ``'importance'``. + conv_pos (str): The position of the 1x1 2D conv. + Defaults to ``'pre'``. + with_res (bool): Whether to use residual connection. + Defaults to False. + norm (str): The name of norm layer. Defaults to ``'BN'``. + act (str): The name of activation layer. Defaults to ``'Relu'``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + adaptive: str = 'importance', + conv_pos: str = 'pre', + with_res: bool = False, + norm: str = 'BN', + act: str = 'ReLU', + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_subsets = A.size(0) + + assert adaptive in [None, 'init', 'offset', 'importance'] + self.adaptive = adaptive + assert conv_pos in ['pre', 'post'] + self.conv_pos = conv_pos + self.with_res = with_res + + self.norm_cfg = norm if isinstance(norm, dict) else dict(type=norm) + self.act_cfg = act if isinstance(act, dict) else dict(type=act) + self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] + self.act = build_activation_layer(self.act_cfg) + + if self.adaptive == 'init': + self.A = nn.Parameter(A.clone()) + else: + self.register_buffer('A', A) + + if self.adaptive in ['offset', 'importance']: + self.PA = nn.Parameter(A.clone()) + if self.adaptive == 'offset': + nn.init.uniform_(self.PA, -1e-6, 1e-6) + elif self.adaptive == 'importance': + nn.init.constant_(self.PA, 1) + + if self.conv_pos == 'pre': + self.conv = nn.Conv2d(in_channels, out_channels * A.size(0), 1) + elif self.conv_pos == 'post': + self.conv = nn.Conv2d(A.size(0) * in_channels, out_channels, 1) + + if self.with_res: + if in_channels != out_channels: + self.down = Sequential( + nn.Conv2d(in_channels, out_channels, 1), + build_norm_layer(self.norm_cfg, out_channels)[1]) + else: + self.down = lambda x: x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + n, c, t, v = x.shape + res = self.down(x) if self.with_res else 0 + + A_switch = {None: self.A, 'init': self.A} + if hasattr(self, 'PA'): + A_switch.update({ + 'offset': self.A + self.PA, + 'importance': self.A * self.PA + }) + A = A_switch[self.adaptive] + + if self.conv_pos == 'pre': + x = 
self.conv(x) + x = x.view(n, self.num_subsets, -1, t, v) + x = torch.einsum('nkctv,kvw->nctw', (x, A)).contiguous() + elif self.conv_pos == 'post': + x = torch.einsum('nctv,kvw->nkctw', (x, A)).contiguous() + x = x.view(n, -1, t, v) + x = self.conv(x) + + return self.act(self.bn(x) + res) + + +class unit_aagcn(BaseModule): + """The graph convolution unit of AAGCN. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + A (torch.Tensor): The adjacency matrix defined in the graph + with shape of `(num_subsets, num_joints, num_joints)`. + coff_embedding (int): The coefficient for downscaling the embedding + dimension. Defaults to 4. + adaptive (bool): Whether to use adaptive graph convolutional layer. + Defaults to True. + attention (bool): Whether to use the STC-attention module. + Defaults to True. + init_cfg (dict or list[dict]): Initialization config dict. Defaults to + ``[ + dict(type='Constant', layer='BatchNorm2d', val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out'), + dict(type='ConvBranch', name='conv_d') + ]``. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + A: torch.Tensor, + coff_embedding: int = 4, + adaptive: bool = True, + attention: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict( + type='Constant', + layer='BatchNorm2d', + val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out'), + dict(type='ConvBranch', name='conv_d') + ] + ) -> None: + + if attention: + attention_init_cfg = [ + dict( + type='Constant', + layer='Conv1d', + val=0, + override=dict(type='Xavier', name='conv_sa')), + dict( + type='Kaiming', + layer='Linear', + mode='fan_in', + override=dict(type='Constant', val=0, name='fc2c')) + ] + init_cfg = cp.copy(init_cfg) + init_cfg.extend(attention_init_cfg) + + super(unit_aagcn, self).__init__(init_cfg=init_cfg) + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + self.num_subset = A.shape[0] + self.adaptive = adaptive + self.attention = attention + + num_joints = A.shape[-1] + + self.conv_d = ModuleList() + for i in range(self.num_subset): + self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1)) + + if self.adaptive: + self.A = nn.Parameter(A) + + self.alpha = nn.Parameter(torch.zeros(1)) + self.conv_a = ModuleList() + self.conv_b = ModuleList() + for i in range(self.num_subset): + self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1)) + self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1)) + else: + self.register_buffer('A', A) + + if self.attention: + self.conv_ta = nn.Conv1d(out_channels, 1, 9, padding=4) + # s attention + ker_joint = num_joints if num_joints % 2 else num_joints - 1 + pad = (ker_joint - 1) // 2 + self.conv_sa = nn.Conv1d(out_channels, 1, ker_joint, padding=pad) + # channel attention + rr = 2 + self.fc1c = nn.Linear(out_channels, out_channels // rr) + self.fc2c = nn.Linear(out_channels // rr, out_channels) + + self.down = 
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Defines the computation performed at every call.

        Input/output shape is (N, C, T, V): batch, channels, frames,
        joints. Accumulates one graph-convolution term per subset, adds the
        residual path, then (optionally) applies spatial, temporal and
        channel attention in sequence.
        """
        N, C, T, V = x.size()

        y = None
        if self.adaptive:
            # Data-dependent adjacency: embed x twice, take a normalized
            # joint-by-joint affinity, and blend it into the static A[i]
            # with the learnable scalar `alpha`.
            for i in range(self.num_subset):
                A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(
                    N, V, self.inter_c * T)
                A2 = self.conv_b[i](x).view(N, self.inter_c * T, V)
                A1 = self.tan(torch.matmul(A1, A2) / A1.size(-1))  # N V V
                A1 = self.A[i] + A1 * self.alpha
                A2 = x.view(N, C * T, V)
                z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
                y = z + y if y is not None else z
        else:
            # Fixed adjacency: plain graph convolution per subset.
            for i in range(self.num_subset):
                A1 = self.A[i]
                A2 = x.view(N, C * T, V)
                z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
                y = z + y if y is not None else z

        y = self.relu(self.bn(y) + self.down(x))

        if self.attention:
            # STC attention: each stage rescales y and adds it back
            # (i.e. y * gate + y), so gates act as residual boosts.
            # 1) spatial attention over joints
            se = y.mean(-2)  # N C V
            se1 = self.sigmoid(self.conv_sa(se))  # N 1 V
            y = y * se1.unsqueeze(-2) + y
            # 2) temporal attention over frames
            se = y.mean(-1)  # N C T
            se1 = self.sigmoid(self.conv_ta(se))  # N 1 T
            y = y * se1.unsqueeze(-1) + y
            # 3) channel attention (squeeze-and-excitation style)
            se = y.mean(-1).mean(-1)  # N C
            se1 = self.relu(self.fc1c(se))
            se2 = self.sigmoid(self.fc2c(se1))  # N C
            y = y * se2.unsqueeze(-1).unsqueeze(-1) + y
        return y
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 9,
        stride: int = 1,
        dilation: int = 1,
        norm: str = 'BN',
        dropout: float = 0,
        # NOTE(review): mutable default argument; safe only as long as no
        # caller mutates it in place (cf. unit_aagcn, which copies before
        # extending). Consider a None default upstream.
        init_cfg: Union[Dict, List[Dict]] = [
            dict(type='Constant', layer='BatchNorm2d', val=1),
            dict(type='Kaiming', layer='Conv2d', mode='fan_out')
        ]
    ) -> None:
        """Build the temporal conv unit; see class docstring for args."""
        super().__init__(init_cfg=init_cfg)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.norm_cfg = norm if isinstance(norm, dict) else dict(type=norm)
        # "Same"-style padding along time for the dilated kernel, so T is
        # preserved when stride == 1.
        pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2

        # Convolve along the temporal axis only (kernel width 1 on joints).
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(kernel_size, 1),
            padding=(pad, 0),
            stride=(stride, 1),
            dilation=(dilation, 1))
        # norm=None disables normalization (used by mstcn branches).
        self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] \
            if norm is not None else nn.Identity()

        self.drop = nn.Dropout(dropout, inplace=True)
        self.stride = stride
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + mid_channels: int = None, + dropout: float = 0., + ms_cfg: List = [(3, 1), (3, 2), (3, 3), (3, 4), ('max', 3), + '1x1'], + stride: int = 1, + init_cfg: Union[Dict, List[Dict]] = None) -> None: + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.ms_cfg = ms_cfg + num_branches = len(ms_cfg) + self.num_branches = num_branches + self.in_channels = in_channels + self.out_channels = out_channels + self.act = nn.ReLU() + + if mid_channels is None: + mid_channels = out_channels // num_branches + rem_mid_channels = out_channels - mid_channels * (num_branches - 1) + else: + assert isinstance(mid_channels, float) and mid_channels > 0 + mid_channels = int(out_channels * mid_channels) + rem_mid_channels = mid_channels + + self.mid_channels = mid_channels + self.rem_mid_channels = rem_mid_channels + + branches = [] + for i, cfg in enumerate(ms_cfg): + branch_c = rem_mid_channels if i == 0 else mid_channels + if cfg == '1x1': + branches.append( + nn.Conv2d( + in_channels, + branch_c, + kernel_size=1, + stride=(stride, 1))) + continue + assert isinstance(cfg, tuple) + if cfg[0] == 'max': + branches.append( + Sequential( + nn.Conv2d(in_channels, branch_c, kernel_size=1), + nn.BatchNorm2d(branch_c), self.act, + nn.MaxPool2d( + kernel_size=(cfg[1], 1), + stride=(stride, 1), + padding=(1, 0)))) + continue + assert isinstance(cfg[0], int) and isinstance(cfg[1], int) + branch = Sequential( + nn.Conv2d(in_channels, branch_c, kernel_size=1), + nn.BatchNorm2d(branch_c), self.act, + unit_tcn( + branch_c, + branch_c, + kernel_size=cfg[0], + stride=stride, + dilation=cfg[1], + norm=None)) + branches.append(branch) + + self.branches = ModuleList(branches) + tin_channels = mid_channels * (num_branches - 1) + rem_mid_channels + + self.transform = Sequential( + nn.BatchNorm2d(tin_channels), self.act, + nn.Conv2d(tin_channels, out_channels, kernel_size=1)) + + self.bn = 
nn.BatchNorm2d(out_channels) + self.drop = nn.Dropout(dropout, inplace=True) + + def inner_forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + N, C, T, V = x.shape + + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + feat = torch.cat(branch_outs, dim=1) + feat = self.transform(feat) + return feat + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + out = self.inner_forward(x) + out = self.bn(out) + return self.drop(out) diff --git a/mmaction/models/utils/graph.py b/mmaction/models/utils/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..d7f506f5fc2de86d32b91c13d0f971e85c6aa5bc --- /dev/null +++ b/mmaction/models/utils/graph.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch + + +def k_adjacency(A: Union[torch.Tensor, np.ndarray], + k: int, + with_self: bool = False, + self_factor: float = 1) -> np.ndarray: + """Construct k-adjacency matrix. + + Args: + A (torch.Tensor or np.ndarray): The adjacency matrix. + k (int): The number of hops. + with_self (bool): Whether to add self-loops to the + k-adjacency matrix. The self-loops is critical + for learning the relationships between the current + joint and its k-hop neighbors. Defaults to False. + self_factor (float): The scale factor to the added + identity matrix. Defaults to 1. + + Returns: + np.ndarray: The k-adjacency matrix. 
+ """ + # A is a 2D square array + if isinstance(A, torch.Tensor): + A = A.data.cpu().numpy() + assert isinstance(A, np.ndarray) + Iden = np.eye(len(A), dtype=A.dtype) + if k == 0: + return Iden + Ak = np.minimum(np.linalg.matrix_power(A + Iden, k), 1) - np.minimum( + np.linalg.matrix_power(A + Iden, k - 1), 1) + if with_self: + Ak += (self_factor * Iden) + return Ak + + +def edge2mat(edges: List[Tuple[int, int]], num_node: int) -> np.ndarray: + """Get adjacency matrix from edges. + + Args: + edges (list[tuple[int, int]]): The edges of the graph. + num_node (int): The number of nodes of the graph. + + Returns: + np.ndarray: The adjacency matrix. + """ + A = np.zeros((num_node, num_node)) + for i, j in edges: + A[j, i] = 1 + return A + + +def normalize_digraph(A: np.ndarray, dim: int = 0) -> np.ndarray: + """Normalize the digraph according to the given dimension. + + Args: + A (np.ndarray): The adjacency matrix. + dim (int): The dimension to perform normalization. + Defaults to 0. + + Returns: + np.ndarray: The normalized adjacency matrix. + """ + # A is a 2D square array + Dl = np.sum(A, dim) + h, w = A.shape + Dn = np.zeros((w, w)) + + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + + AD = np.dot(A, Dn) + return AD + + +def get_hop_distance(num_node: int, + edges: List[Tuple[int, int]], + max_hop: int = 1) -> np.ndarray: + """Get n-hop distance matrix by edges. + + Args: + num_node (int): The number of nodes of the graph. + edges (list[tuple[int, int]]): The edges of the graph. + max_hop (int): The maximal distance between two connected nodes. + Defaults to 1. + + Returns: + np.ndarray: The n-hop distance matrix. 
class Graph:
    """The Graph to model the skeletons.

    Args:
        layout (str or dict): must be one of the following candidates:
            'openpose', 'nturgb+d', 'coco', or a dict with the following
            keys: 'num_node', 'inward', and 'center'.
            Defaults to ``'coco'``.
        mode (str): must be one of the following candidates:
            'stgcn_spatial', 'spatial'. Defaults to ``'spatial'``.
        max_hop (int): the maximal distance between two connected
            nodes. Defaults to 1.
    """

    def __init__(self,
                 layout: Union[str, dict] = 'coco',
                 mode: str = 'spatial',
                 max_hop: int = 1) -> None:

        self.max_hop = max_hop
        self.layout = layout
        self.mode = mode

        # A dict layout must carry the three keys ``set_layout`` reads.
        if isinstance(layout, dict):
            assert 'num_node' in layout
            assert 'inward' in layout
            assert 'center' in layout
        else:
            assert layout in ['openpose', 'nturgb+d', 'coco']

        self.set_layout(layout)
        self.hop_dis = get_hop_distance(self.num_node, self.inward, max_hop)

        # ``mode`` names one of the adjacency-building methods below.
        assert hasattr(self, mode), f'Do Not Exist This Mode: {mode}'
        self.A = getattr(self, mode)()

    def __str__(self) -> str:
        # Fix: ``__str__`` must return ``str``. The previous implementation
        # returned the raw ndarray ``self.A``, which makes ``str(graph)``
        # raise ``TypeError: __str__ returned non-string``.
        return str(self.A)

    def set_layout(self, layout: Union[str, dict]) -> None:
        """Initialize the layout of candidates."""

        if layout == 'openpose':
            self.num_node = 18
            self.inward = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11),
                           (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1),
                           (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)]
            self.center = 1
        elif layout == 'nturgb+d':
            self.num_node = 25
            # NTU joints are documented 1-based; shift to 0-based below.
            neighbor_base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5),
                             (7, 6), (8, 7), (9, 21), (10, 9), (11, 10),
                             (12, 11), (13, 1), (14, 13), (15, 14), (16, 15),
                             (17, 1), (18, 17), (19, 18), (20, 19), (22, 8),
                             (23, 8), (24, 12), (25, 12)]
            self.inward = [(i - 1, j - 1) for (i, j) in neighbor_base]
            self.center = 21 - 1
        elif layout == 'coco':
            self.num_node = 17
            self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5),
                           (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0),
                           (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)]
            self.center = 0
        elif isinstance(layout, dict):
            self.num_node = layout['num_node']
            self.inward = layout['inward']
            self.center = layout['center']
        else:
            raise ValueError(f'Do Not Exist This Layout: {layout}')
        self.self_link = [(i, i) for i in range(self.num_node)]
        self.outward = [(j, i) for (i, j) in self.inward]
        self.neighbor = self.inward + self.outward

    def stgcn_spatial(self) -> np.ndarray:
        """ST-GCN spatial mode.

        Returns:
            np.ndarray: stacked partition matrices; for each hop there is a
            "close" matrix and, for hop > 0, a "further" matrix, split by
            distance to the center joint.
        """
        adj = np.zeros((self.num_node, self.num_node))
        adj[self.hop_dis <= self.max_hop] = 1
        normalize_adj = normalize_digraph(adj)
        hop_dis = self.hop_dis
        center = self.center

        A = []
        for hop in range(self.max_hop + 1):
            a_close = np.zeros((self.num_node, self.num_node))
            a_further = np.zeros((self.num_node, self.num_node))
            for i in range(self.num_node):
                for j in range(self.num_node):
                    if hop_dis[j, i] == hop:
                        # Closer-to-center edges and further-from-center
                        # edges go to separate partitions.
                        if hop_dis[j, center] >= hop_dis[i, center]:
                            a_close[j, i] = normalize_adj[j, i]
                        else:
                            a_further[j, i] = normalize_adj[j, i]
            A.append(a_close)
            if hop > 0:
                A.append(a_further)
        return np.stack(A)

    def spatial(self) -> np.ndarray:
        """Standard spatial mode: identity / inward / outward partitions."""
        Iden = edge2mat(self.self_link, self.num_node)
        In = normalize_digraph(edge2mat(self.inward, self.num_node))
        Out = normalize_digraph(edge2mat(self.outward, self.num_node))
        A = np.stack((Iden, In, Out))
        return A

    def binary_adj(self) -> np.ndarray:
        """Construct an adjacency matrix for an undirected graph."""
        A = edge2mat(self.neighbor, self.num_node)
        return A[None]
0000000000000000000000000000000000000000..539627c4c740fba8a32aeb2959b4d0fe31d40672 --- /dev/null +++ b/mmaction/registry.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMAction provides 20 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. +""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import FUNCTIONS as MMENGINE_FUNCTION +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage 
# Registry nodes for MMAction2. Each registry below is a child of the
# corresponding root registry in MMEngine, so modules registered here can
# also be resolved by downstream projects through the MMEngine scope tree.

# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner`
RUNNERS = Registry(
    'runner', parent=MMENGINE_RUNNERS, locations=['mmaction.engine.runner'])
# manage runner constructors that define how to initialize runners
RUNNER_CONSTRUCTORS = Registry(
    'runner constructor',
    parent=MMENGINE_RUNNER_CONSTRUCTORS,
    locations=['mmaction.engine.runner'])
# manage all kinds of loops like `EpochBasedTrainLoop`
LOOPS = Registry(
    'loop', parent=MMENGINE_LOOPS, locations=['mmaction.engine.runner'])
# manage all kinds of hooks like `CheckpointHook`
HOOKS = Registry(
    'hook', parent=MMENGINE_HOOKS, locations=['mmaction.engine.hooks'])

# manage data-related modules
DATASETS = Registry(
    'dataset', parent=MMENGINE_DATASETS, locations=['mmaction.datasets'])
DATA_SAMPLERS = Registry(
    'data sampler',
    parent=MMENGINE_DATA_SAMPLERS,
    locations=['mmaction.datasets'])
TRANSFORMS = Registry(
    'transform',
    parent=MMENGINE_TRANSFORMS,
    locations=['mmaction.datasets.transforms'])

# manage all kinds of modules inheriting `nn.Module`
MODELS = Registry(
    'model', parent=MMENGINE_MODELS, locations=['mmaction.models'])
# manage all kinds of model wrappers like 'MMDistributedDataParallel'
MODEL_WRAPPERS = Registry(
    'model_wrapper',
    parent=MMENGINE_MODEL_WRAPPERS,
    locations=['mmaction.models'])
# manage all kinds of weight initialization modules like `Uniform`
WEIGHT_INITIALIZERS = Registry(
    'weight initializer',
    parent=MMENGINE_WEIGHT_INITIALIZERS,
    locations=['mmaction.models'])

# manage all kinds of optimizers like `SGD` and `Adam`
OPTIMIZERS = Registry(
    'optimizer',
    parent=MMENGINE_OPTIMIZERS,
    locations=['mmaction.engine.optimizers'])
# manage optimizer wrappers (gradient clipping, AMP, accumulation, ...)
OPTIM_WRAPPERS = Registry(
    'optim_wrapper',
    parent=MMENGINE_OPTIM_WRAPPERS,
    locations=['mmaction.engine.optimizers'])
# manage constructors that customize the optimization hyperparameters.
OPTIM_WRAPPER_CONSTRUCTORS = Registry(
    'optimizer wrapper constructor',
    parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS,
    locations=['mmaction.engine.optimizers'])
# manage all kinds of parameter schedulers like `MultiStepLR`
PARAM_SCHEDULERS = Registry(
    'parameter scheduler',
    parent=MMENGINE_PARAM_SCHEDULERS,
    locations=['mmaction.engine'])

# manage all kinds of metrics
METRICS = Registry(
    'metric', parent=MMENGINE_METRICS, locations=['mmaction.evaluation'])
# manage evaluator
EVALUATOR = Registry(
    'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmaction.evaluation'])

# manage task-specific modules like anchor generators and box coders
TASK_UTILS = Registry(
    'task util', parent=MMENGINE_TASK_UTILS, locations=['mmaction.models'])

# manage visualizer
VISUALIZERS = Registry(
    'visualizer',
    parent=MMENGINE_VISUALIZERS,
    locations=['mmaction.visualization'])
# manage visualizer backend
VISBACKENDS = Registry(
    'vis_backend',
    parent=MMENGINE_VISBACKENDS,
    locations=['mmaction.visualization'])

# manage log processors that format the training/validation logs
LOG_PROCESSORS = Registry(
    'log_processor',
    parent=MMENGINE_LOG_PROCESSORS,
    locations=['mmaction.engine'])

# manage inferencers that provide high-level inference APIs
INFERENCERS = Registry(
    'inferencer',
    parent=MMENGINE_INFERENCERS,
    locations=['mmaction.apis.inferencers'])

# manage registered functions
# NOTE(review): location 'mmaction.mmengine' looks unusual compared with the
# other registries — confirm the intended module path.
FUNCTION = Registry(
    'function', parent=MMENGINE_FUNCTION, locations=['mmaction.mmengine'])

# Tokenizer to encode sequence
# NOTE: deliberately has no MMEngine parent — this registry is local to
# MMAction2.
TOKENIZER = Registry(
    'tokenizer',
    locations=['mmaction.models'],
)
# Accepted python types for labels and scores, normalized by the helpers
# below before being attached to an ``ActionDataSample``.
LABEL_TYPE = Union[torch.Tensor, np.ndarray, Sequence, int]
SCORE_TYPE = Union[torch.Tensor, np.ndarray, Sequence, Dict]


def format_label(value: LABEL_TYPE) -> torch.Tensor:
    """Convert various python types to label-format tensor.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int`.

    Args:
        value (torch.Tensor | numpy.ndarray | Sequence | int): Label value.

    Returns:
        :obj:`torch.Tensor`: The formatted label tensor.

    Raises:
        TypeError: If ``value`` is of an unsupported type.
    """

    # A 0-d tensor/array holds a single label; unwrap it to a python int so
    # the branches below wrap it into a 1-d long tensor.
    if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0:
        value = int(value.item())

    if isinstance(value, np.ndarray):
        value = torch.from_numpy(value).to(torch.long)
    elif isinstance(value, Sequence) and not isinstance(value, str):
        value = torch.tensor(value).to(torch.long)
    elif isinstance(value, int):
        value = torch.LongTensor([value])
    elif not isinstance(value, torch.Tensor):
        raise TypeError(f'Type {type(value)} is not an available label type.')

    # NOTE: a non-0-d ``torch.Tensor`` input is returned unchanged (its dtype
    # is not coerced to long), matching the long-standing behavior.
    return value


def format_score(value: SCORE_TYPE) -> Union[torch.Tensor, Dict]:
    """Convert various python types to score-format tensor.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`dict`.

    Args:
        value (torch.Tensor | numpy.ndarray | Sequence | dict):
            Score values or dict of scores values.

    Returns:
        :obj:`torch.Tensor` | dict: The formatted scores.

    Raises:
        TypeError: If ``value`` (or a dict entry) is of an unsupported type.
    """

    if isinstance(value, np.ndarray):
        value = torch.from_numpy(value).float()
    elif isinstance(value, Sequence) and not isinstance(value, str):
        value = torch.tensor(value).float()
    elif isinstance(value, dict):
        # Fix: build a new dict instead of mutating the caller's argument
        # in place — the old code overwrote the input dict's values.
        value = {k: format_score(v) for k, v in value.items()}
    elif not isinstance(value, torch.Tensor):
        raise TypeError(f'Type {type(value)} is not an available score type.')

    return value
length of score {len(score)} should be '\ + f'equal to the num_classes {self.num_classes}.' + else: + self.set_field( + name='num_classes', value=len(score), field_type='metainfo') + return self + + @property + def proposals(self): + """Property of `proposals`""" + return self._proposals + + @proposals.setter + def proposals(self, value): + """Setter of `proposals`""" + self.set_field(value, '_proposals', dtype=InstanceData) + + @proposals.deleter + def proposals(self): + """Deleter of `proposals`""" + del self._proposals + + @property + def gt_instances(self): + """Property of `gt_instances`""" + return self._gt_instances + + @gt_instances.setter + def gt_instances(self, value): + """Setter of `gt_instances`""" + self.set_field(value, '_gt_instances', dtype=InstanceData) + + @gt_instances.deleter + def gt_instances(self): + """Deleter of `gt_instances`""" + del self._gt_instances + + @property + def features(self): + """Setter of `features`""" + return self._features + + @features.setter + def features(self, value): + """Setter of `features`""" + self.set_field(value, '_features', dtype=InstanceData) + + @features.deleter + def features(self): + """Deleter of `features`""" + del self._features diff --git a/mmaction/structures/bbox/__init__.py b/mmaction/structures/bbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1e23c58af3e3901644ceac89770374eadbf725 --- /dev/null +++ b/mmaction/structures/bbox/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def bbox_target(pos_bboxes_list: List[torch.Tensor],
                neg_bboxes_list: List[torch.Tensor],
                gt_labels: List[torch.Tensor],
                cfg) -> tuple:
    """Generate classification targets for bboxes.

    Args:
        pos_bboxes_list (List[torch.Tensor]): Positive bboxes list.
        neg_bboxes_list (List[torch.Tensor]): Negative bboxes list.
        gt_labels (List[torch.Tensor]): Groundtruth classification label
            list; one ``(num_pos, num_classes)`` tensor per image.
        cfg (dict-like): RCNN config; only ``cfg.pos_weight`` is read
            (non-positive values fall back to a weight of 1.0).

    Returns:
        tuple: ``(labels, label_weights)`` concatenated over all images.
    """
    labels, label_weights = [], []
    pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight

    assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels)

    for pos_bboxes, neg_bboxes, gt_label in zip(pos_bboxes_list,
                                                neg_bboxes_list, gt_labels):
        num_pos = pos_bboxes.size(0)
        num_neg = neg_bboxes.size(0)
        num_samples = num_pos + num_neg

        # Pad negative rows with all-zero labels below the positives.
        label = F.pad(gt_label, (0, 0, 0, num_neg))
        label_weight = pos_bboxes.new_zeros(num_samples)
        label_weight[:num_pos] = pos_weight
        # Fix: guard the negative slice. With ``num_neg == 0`` the old
        # ``label_weight[-0:] = 1.`` selected the WHOLE tensor and silently
        # overwrote the positive weights with 1.
        if num_neg > 0:
            label_weight[-num_neg:] = 1.

        labels.append(label)
        label_weights.append(label_weight)

    labels = torch.cat(labels, 0)
    label_weights = torch.cat(label_weights, 0)
    return labels, label_weights
+ + Args: + bboxes (torch.Tensor): shape ``(n, 4)``. + labels (torch.Tensor): shape ``(n, num_classes)``. + num_classes (int): class number, including background class. + thr (float): The score threshold used when converting predictions to + detection results. If a single negative value, uses single-label + classification. + Returns: + List(ndarray): bbox results of each class. + """ + if bboxes.shape[0] == 0: + return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + + bboxes = bboxes.cpu().numpy() + scores = labels.cpu().numpy() # rename for clarification + + # Although we can handle single-label classification, we still want scores + assert scores.shape[-1] > 1 + + # Robustly check for multi/single-label: + if not hasattr(thr, '__len__'): + multilabel = thr >= 0 + thr = (thr, ) * num_classes + else: + multilabel = True + + # Check Shape + assert scores.shape[1] == num_classes + assert len(thr) == num_classes + + result = [] + for i in range(num_classes - 1): + if multilabel: + where = (scores[:, i + 1] > thr[i + 1]) + else: + where = (scores[:, 1:].argmax(axis=1) == i) + result.append( + np.concatenate((bboxes[where, :4], scores[where, i + 1:i + 2]), + axis=1)) + return result diff --git a/mmaction/testing/__init__.py b/mmaction/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2be44ffe3f2cf0a528f5c69fd4db116b3f8cc202 --- /dev/null +++ b/mmaction/testing/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ._utils import (check_norm_state, generate_backbone_demo_inputs, + generate_detector_demo_inputs, get_audio_recognizer_cfg, + get_cfg, get_detector_cfg, get_localizer_cfg, + get_recognizer_cfg, get_similarity_cfg, + get_skeletongcn_cfg) + +__all__ = [ + 'check_norm_state', 'generate_backbone_demo_inputs', 'get_cfg', + 'get_recognizer_cfg', 'get_audio_recognizer_cfg', 'get_localizer_cfg', + 'get_detector_cfg', 'generate_detector_demo_inputs', 'get_skeletongcn_cfg', + 'get_similarity_cfg' +] diff --git a/mmaction/testing/_utils.py b/mmaction/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f452f6eaf542824626daf4e0fd7837a0bdc786f4 --- /dev/null +++ b/mmaction/testing/_utils.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine +import numpy as np +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def generate_backbone_demo_inputs(input_shape=(1, 3, 64, 64)): + """Create a superset of inputs needed to run backbone. + + Args: + input_shape (tuple): input batch dimensions. + Defaults to ``(1, 3, 64, 64)``. + """ + imgs = np.random.random(input_shape) + imgs = torch.FloatTensor(imgs) + + return imgs + + +# TODO Remove this API +def generate_recognizer_demo_inputs( + input_shape=(1, 3, 3, 224, 224), model_type='2D'): + """Create a superset of inputs needed to run test or train batches. + + Args: + input_shape (tuple): input batch dimensions. + Default: (1, 250, 3, 224, 224). + model_type (str): Model type for data generation, from {'2D', '3D'}. 
+ Default:'2D' + """ + if len(input_shape) == 5: + (N, L, _, _, _) = input_shape + elif len(input_shape) == 6: + (N, M, _, L, _, _) = input_shape + + imgs = np.random.random(input_shape) + + if model_type == '2D' or model_type == 'skeleton': + gt_labels = torch.LongTensor([2] * N) + elif model_type == '3D': + gt_labels = torch.LongTensor([2] * M) + elif model_type == 'audio': + gt_labels = torch.LongTensor([2] * L) + else: + raise ValueError(f'Data type {model_type} is not available') + + inputs = {'imgs': torch.FloatTensor(imgs), 'gt_labels': gt_labels} + return inputs + + +def generate_detector_demo_inputs( + input_shape=(1, 3, 4, 224, 224), num_classes=81, train=True, + device='cpu'): + num_samples = input_shape[0] + if not train: + assert num_samples == 1 + + def random_box(n): + box = torch.rand(n, 4) * 0.5 + box[:, 2:] += 0.5 + box[:, 0::2] *= input_shape[3] + box[:, 1::2] *= input_shape[4] + if device == 'cuda': + box = box.cuda() + return box + + def random_label(n): + label = torch.randn(n, num_classes) + label = (label > 0.8).type(torch.float32) + label[:, 0] = 0 + if device == 'cuda': + label = label.cuda() + return label + + img = torch.FloatTensor(np.random.random(input_shape)) + if device == 'cuda': + img = img.cuda() + + proposals = [random_box(2) for i in range(num_samples)] + gt_bboxes = [random_box(2) for i in range(num_samples)] + gt_labels = [random_label(2) for i in range(num_samples)] + img_metas = [dict(img_shape=input_shape[-2:]) for i in range(num_samples)] + + if train: + return dict( + img=img, + proposals=proposals, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + img_metas=img_metas) + + return dict(img=[img], proposals=[proposals], img_metas=[img_metas]) + + +def get_cfg(config_type, fname): + """Grab configs necessary to create a recognizer. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. 
+ """ + config_types = ('recognition', 'recognition_audio', 'localization', + 'detection', 'skeleton', 'retrieval') + assert config_type in config_types + + repo_dpath = osp.dirname(osp.dirname(osp.dirname(__file__))) + config_dpath = osp.join(repo_dpath, 'configs/' + config_type) + config_fpath = osp.join(config_dpath, fname) + if not osp.exists(config_dpath): + raise Exception('Cannot find config path') + config = mmengine.Config.fromfile(config_fpath) + return config + + +def get_recognizer_cfg(fname): + return get_cfg('recognition', fname) + + +def get_audio_recognizer_cfg(fname): + return get_cfg('recognition_audio', fname) + + +def get_localizer_cfg(fname): + return get_cfg('localization', fname) + + +def get_detector_cfg(fname): + return get_cfg('detection', fname) + + +def get_skeletongcn_cfg(fname): + return get_cfg('skeleton', fname) + + +def get_similarity_cfg(fname): + return get_cfg('retrieval', fname) diff --git a/mmaction/utils/__init__.py b/mmaction/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2736dd102a20ac1c411e82922c687cee6d71dc6f --- /dev/null +++ b/mmaction/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .gradcam_utils import GradCAM +from .misc import (VideoWriter, frame_extract, get_random_string, get_shm_dir, + get_str_type, get_thread_id) +from .progress import track, track_on_main_process +from .setup_env import register_all_modules +from .typing_utils import * # noqa: F401,F403 + +__all__ = [ + 'collect_env', 'get_random_string', 'get_thread_id', 'get_shm_dir', + 'frame_extract', 'GradCAM', 'register_all_modules', 'VideoWriter', + 'get_str_type', 'track', 'track_on_main_process' +] diff --git a/mmaction/utils/__pycache__/__init__.cpython-312.pyc b/mmaction/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d353535d98b949582419e7d41c565c7663f82a9 Binary files /dev/null and b/mmaction/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/collect_env.cpython-312.pyc b/mmaction/utils/__pycache__/collect_env.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff2d4c4d2dd87ed4cef35103b01a39ab2fd71b2e Binary files /dev/null and b/mmaction/utils/__pycache__/collect_env.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/dependency.cpython-312.pyc b/mmaction/utils/__pycache__/dependency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a2e4697e332581ee6dd420764931ea81676065b Binary files /dev/null and b/mmaction/utils/__pycache__/dependency.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/gradcam_utils.cpython-312.pyc b/mmaction/utils/__pycache__/gradcam_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b5b880ad56b43e616b0bf9c64c64c46b6e71a74 Binary files /dev/null and b/mmaction/utils/__pycache__/gradcam_utils.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/misc.cpython-312.pyc b/mmaction/utils/__pycache__/misc.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bd3a6c88a10917dea1432e5d5f380e653f96b739 Binary files /dev/null and b/mmaction/utils/__pycache__/misc.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/progress.cpython-312.pyc b/mmaction/utils/__pycache__/progress.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd284bc727622389506bca68ab00fb7d20fbb7a8 Binary files /dev/null and b/mmaction/utils/__pycache__/progress.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/setup_env.cpython-312.pyc b/mmaction/utils/__pycache__/setup_env.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..381d8d5913d198377e0aa1dbcc2e5089f509bfeb Binary files /dev/null and b/mmaction/utils/__pycache__/setup_env.cpython-312.pyc differ diff --git a/mmaction/utils/__pycache__/typing_utils.cpython-312.pyc b/mmaction/utils/__pycache__/typing_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..857bf813192b1f6b8993cd6f1d335ced8bbe68a0 Binary files /dev/null and b/mmaction/utils/__pycache__/typing_utils.cpython-312.pyc differ diff --git a/mmaction/utils/collect_env.py b/mmaction/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..d2e4314b7f1bb00e6b3c29a93aff39d9e284a88f --- /dev/null +++ b/mmaction/utils/collect_env.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_basic_env + +import mmaction + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_basic_env() + env_info['MMAction2'] = ( + mmaction.__version__ + '+' + get_git_hash(digits=7)) + env_info['MMCV'] = (mmcv.__version__) + + try: + import mmdet + env_info['MMDetection'] = (mmdet.__version__) + except ImportError: + pass + + try: + import mmpose + env_info['MMPose'] = (mmpose.__version__) + except ImportError: + pass + + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmaction/utils/dependency.py b/mmaction/utils/dependency.py new file mode 100644 index 0000000000000000000000000000000000000000..61a045ede5c01d9c9bfc299da1a41238fef3720c --- /dev/null +++ b/mmaction/utils/dependency.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re +from functools import wraps +from inspect import isfunction + +from importlib_metadata import PackageNotFoundError, distribution +from mmengine.utils import digit_version + + +def satisfy_requirement(dep): + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, dep, maxsplit=1) + parts = [p.strip() for p in parts] + package = parts[0] + if len(parts) > 1: + op, version = parts[1:] + op = { + '>=': '__ge__', + '==': '__eq__', + '>': '__gt__', + '<': '__lt__', + '<=': '__le__' + }[op] + else: + op, version = None, None + + try: + dist = distribution(package) + if op is None or getattr(digit_version(dist.version), op)( + digit_version(version)): + return True + except PackageNotFoundError: + pass + + return False + + +def require(dep, install=None): + """A wrapper of function for extra package requirements. + + Args: + dep (str): The dependency package name, like ``transformers`` + or ``transformers>=4.28.0``. 
+ install (str, optional): The installation command hint. Defaults + to None, which means to use "pip install dep". + """ + + def wrapper(fn): + assert isfunction(fn) + + @wraps(fn) + def ask_install(*args, **kwargs): + name = fn.__qualname__.replace('.__init__', '') + ins = install or f'pip install "{dep}"' + raise ImportError( + f'{name} requires {dep}, please install it by `{ins}`.') + + if satisfy_requirement(dep): + fn._verify_require = getattr(fn, '_verify_require', lambda: None) + return fn + + ask_install._verify_require = ask_install + return ask_install + + return wrapper + + +WITH_MULTIMODAL = all( + satisfy_requirement(item) for item in ['transformers>=4.28.0']) + + +def register_multimodal_placeholder(names, registry): + for name in names: + + def ask_install(*args, **kwargs): + raise ImportError( + f'{name} requires extra multi-modal dependencies, please ' + 'install it by `pip install "mmaction2[multimodal]"` ' + 'or `pip install -e ".[multimodal]"`.') + + registry.register_module(name=name, module=ask_install) diff --git a/mmaction/utils/gradcam_utils.py b/mmaction/utils/gradcam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7841485cb665fd7a83d46eb2081847dea97c99de --- /dev/null +++ b/mmaction/utils/gradcam_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GradCAM: + """GradCAM class helps create visualization results. + + Visualization results are blended by heatmaps and input images. + This class is modified from + https://github.com/facebookresearch/SlowFast/blob/master/slowfast/visualization/gradcam_utils.py # noqa + For more information about GradCAM, please visit: + https://arxiv.org/pdf/1610.02391.pdf + + Args: + model (nn.Module): the recognizer model to be used. + target_layer_name (str): name of convolutional layer to + be used to get gradients and feature maps from for creating + localization maps. 
+ colormap (str): matplotlib colormap used to create + heatmap. Defaults to 'viridis'. For more information, please visit + https://matplotlib.org/3.3.0/tutorials/colors/colormaps.html + """ + + def __init__(self, + model: nn.Module, + target_layer_name: str, + colormap: str = 'viridis') -> None: + from ..models.recognizers import Recognizer2D, Recognizer3D + if isinstance(model, Recognizer2D): + self.is_recognizer2d = True + elif isinstance(model, Recognizer3D): + self.is_recognizer2d = False + else: + raise ValueError( + 'GradCAM utils only support Recognizer2D & Recognizer3D.') + + self.model = model + self.model.eval() + self.target_gradients = None + self.target_activations = None + + import matplotlib.pyplot as plt + self.colormap = plt.get_cmap(colormap) + self._register_hooks(target_layer_name) + + def _register_hooks(self, layer_name: str) -> None: + """Register forward and backward hook to a layer, given layer_name, to + obtain gradients and activations. + + Args: + layer_name (str): name of the layer. + """ + + def get_gradients(module, grad_input, grad_output): + self.target_gradients = grad_output[0].detach() + + def get_activations(module, input, output): + self.target_activations = output.clone().detach() + + layer_ls = layer_name.split('/') + prev_module = self.model + for layer in layer_ls: + prev_module = prev_module._modules[layer] + + target_layer = prev_module + target_layer.register_forward_hook(get_activations) + target_layer.register_backward_hook(get_gradients) + + def _calculate_localization_map(self, + data: dict, + use_labels: bool, + delta=1e-20) -> tuple: + """Calculate localization map for all inputs with Grad-CAM. + + Args: + data (dict): model inputs, generated by test pipeline, + use_labels (bool): Whether to use given labels to generate + localization map. + delta (float): used in localization map normalization, + must be small enough. 
Please make sure + `localization_map_max - localization_map_min >> delta` + + Returns: + localization_map (torch.Tensor): the localization map for + input imgs. + preds (torch.Tensor): Model predictions with shape + (batch_size, num_classes). + """ + inputs = data['inputs'] + + # use score before softmax + self.model.cls_head.average_clips = 'score' + # model forward & backward + results = self.model.test_step(data) + preds = [result.pred_score for result in results] + preds = torch.stack(preds) + + if use_labels: + labels = [result.gt_label for result in results] + labels = torch.stack(labels) + score = torch.gather(preds, dim=1, index=labels) + else: + score = torch.max(preds, dim=-1)[0] + self.model.zero_grad() + score = torch.sum(score) + score.backward() + + imgs = torch.stack(inputs) + if self.is_recognizer2d: + # [batch_size, num_segments, 3, H, W] + b, t, _, h, w = imgs.size() + else: + # [batch_size, num_crops*num_clips, 3, clip_len, H, W] + b1, b2, _, t, h, w = imgs.size() + b = b1 * b2 + + gradients = self.target_gradients + activations = self.target_activations + if self.is_recognizer2d: + # [B*Tg, C', H', W'] + b_tg, c, _, _ = gradients.size() + tg = b_tg // b + else: + # source shape: [B, C', Tg, H', W'] + _, c, tg, _, _ = gradients.size() + # target shape: [B, Tg, C', H', W'] + gradients = gradients.permute(0, 2, 1, 3, 4) + activations = activations.permute(0, 2, 1, 3, 4) + + # calculate & resize to [B, 1, T, H, W] + weights = torch.mean(gradients.view(b, tg, c, -1), dim=3) + weights = weights.view(b, tg, c, 1, 1) + activations = activations.view([b, tg, c] + + list(activations.size()[-2:])) + localization_map = torch.sum( + weights * activations, dim=2, keepdim=True) + localization_map = F.relu(localization_map) + localization_map = localization_map.permute(0, 2, 1, 3, 4) + localization_map = F.interpolate( + localization_map, + size=(t, h, w), + mode='trilinear', + align_corners=False) + + # Normalize the localization map. 
+ localization_map_min, localization_map_max = ( + torch.min(localization_map.view(b, -1), dim=-1, keepdim=True)[0], + torch.max(localization_map.view(b, -1), dim=-1, keepdim=True)[0]) + localization_map_min = torch.reshape( + localization_map_min, shape=(b, 1, 1, 1, 1)) + localization_map_max = torch.reshape( + localization_map_max, shape=(b, 1, 1, 1, 1)) + localization_map = (localization_map - localization_map_min) / ( + localization_map_max - localization_map_min + delta) + localization_map = localization_map.data + + return localization_map.squeeze(dim=1), preds + + def _alpha_blending(self, localization_map: torch.Tensor, + input_imgs: torch.Tensor, + alpha: float) -> torch.Tensor: + """Blend heatmaps and model input images and get visulization results. + + Args: + localization_map (torch.Tensor): localization map for all inputs, + generated with Grad-CAM. + input_imgs (torch.Tensor): model inputs, raw images. + alpha (float): transparency level of the heatmap, + in the range [0, 1]. + + Returns: + torch.Tensor: blending results for localization map and input + images, with shape [B, T, H, W, 3] and pixel values in + RGB order within range [0, 1]. + """ + # localization_map shape [B, T, H, W] + localization_map = localization_map.cpu() + + # heatmap shape [B, T, H, W, 3] in RGB order + heatmap = self.colormap(localization_map.detach().numpy()) + heatmap = heatmap[..., :3] + heatmap = torch.from_numpy(heatmap) + input_imgs = torch.stack(input_imgs) + # Permute input imgs to [B, T, H, W, 3], like heatmap + if self.is_recognizer2d: + # Recognizer2D input (B, T, C, H, W) + curr_inp = input_imgs.permute(0, 1, 3, 4, 2) + else: + # Recognizer3D input (B', num_clips*num_crops, C, T, H, W) + # B = B' * num_clips * num_crops + curr_inp = input_imgs.view([-1] + list(input_imgs.size()[2:])) + curr_inp = curr_inp.permute(0, 2, 3, 4, 1) + + # renormalize input imgs to [0, 1] + curr_inp = curr_inp.cpu().float() + curr_inp /= 255. 
+ + # alpha blending + blended_imgs = alpha * heatmap + (1 - alpha) * curr_inp + + return blended_imgs + + def __call__(self, + data: dict, + use_labels: bool = False, + alpha: float = 0.5) -> tuple: + """Visualize the localization maps on their corresponding inputs as + heatmap, using Grad-CAM. + + Generate visualization results for **ALL CROPS**. + For example, for I3D model, if `clip_len=32, num_clips=10` and + use `ThreeCrop` in test pipeline, then for every model inputs, + there are 960(32*10*3) images generated. + + Args: + data (dict): model inputs, generated by test pipeline. + use_labels (bool): Whether to use given labels to generate + localization map. + alpha (float): transparency level of the heatmap, + in the range [0, 1]. + + Returns: + blended_imgs (torch.Tensor): Visualization results, blended by + localization maps and model inputs. + preds (torch.Tensor): Model predictions for inputs. + """ + + # localization_map shape [B, T, H, W] + # preds shape [batch_size, num_classes] + localization_map, preds = self._calculate_localization_map( + data, use_labels=use_labels) + + # blended_imgs shape [B, T, H, W, 3] + blended_imgs = self._alpha_blending(localization_map, data['inputs'], + alpha) + + # blended_imgs shape [B, T, H, W, 3] + # preds shape [batch_size, num_classes] + # Recognizer2D: B = batch_size, T = num_segments + # Recognizer3D: B = batch_size * num_crops * num_clips, T = clip_len + return blended_imgs, preds diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..c58963df4e973a6ca49567f743d61b0a5d14bbc7 --- /dev/null +++ b/mmaction/utils/misc.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import ctypes +import inspect +import os +import os.path as osp +import random +import string +from types import FunctionType, ModuleType +from typing import Optional, Union + +import cv2 +import mmcv +import numpy as np + + +def get_random_string(length: int = 15) -> str: + """Get random string with letters and digits. + + Args: + length (int): Length of random string. Defaults to 15. + """ + return ''.join( + random.choice(string.ascii_letters + string.digits) + for _ in range(length)) + + +def get_thread_id() -> int: + """Get current thread id.""" + # use ctype to find thread id + thread_id = ctypes.CDLL('libc.so.6').syscall(186) + return thread_id + + +def get_shm_dir() -> str: + """Get shm dir for temporary usage.""" + return '/dev/shm' + + +def frame_extract(video_path: str, + short_side: Optional[int] = None, + out_dir: str = './tmp'): + """Extract frames given video_path. + + Args: + video_path (str): The video path. + short_side (int): Target short-side of the output image. + Defaults to None, means keeping original shape. + out_dir (str): The output directory. Defaults to ``'./tmp'``. 
+ """ + # Load the video, extract frames into OUT_DIR/video_name + target_dir = osp.join(out_dir, osp.basename(osp.splitext(video_path)[0])) + os.makedirs(target_dir, exist_ok=True) + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + assert osp.exists(video_path), f'file not exit {video_path}' + vid = cv2.VideoCapture(video_path) + frames = [] + frame_paths = [] + flag, frame = vid.read() + cnt = 0 + new_h, new_w = None, None + while flag: + if short_side is not None: + if new_h is None: + h, w, _ = frame.shape + new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) + frame = mmcv.imresize(frame, (new_w, new_h)) + + frames.append(frame) + frame_path = frame_tmpl.format(cnt + 1) + frame_paths.append(frame_path) + + cv2.imwrite(frame_path, frame) + cnt += 1 + flag, frame = vid.read() + + return frame_paths, frames + + +class VideoWriter(): + + def __init__(self, video_file, fps): + self.video_file = video_file + self.fps = fps + if video_file.endswith('.mp4'): + self.fourcc = 'mp4v' + elif video_file.endswith('.avi'): + self.fourcc = 'XVID' + + out_dir = osp.dirname(osp.abspath(self.video_file)) + if not osp.exists(out_dir): + os.makedirs(out_dir, exist_ok=True) + + def _init_cv2_writer(self, frame): + from cv2 import VideoWriter, VideoWriter_fourcc + height, width = frame.shape[:2] + resolution = (width, height) + self.writer = VideoWriter(self.video_file, + VideoWriter_fourcc(*self.fourcc), self.fps, + resolution) + + def write_frame(self, frame): + if not getattr(self, 'writer', None): + self._init_cv2_writer(frame) + self.writer.write(frame) + + def release(self): + self.writer.release() + + def __enter__(self): + return self + + def __exit__(self, type, value, trace): + self.release() + + +def get_str_type(module: Union[str, ModuleType, FunctionType]) -> str: + """Return the string type name of module. 
+ + Args: + module (str | ModuleType | FunctionType): + The target module class + + Returns: + Class name of the module + """ + if isinstance(module, str): + str_type = module + elif inspect.isclass(module) or inspect.isfunction(module): + str_type = module.__name__ + else: + return None + + return str_type diff --git a/mmaction/utils/progress.py b/mmaction/utils/progress.py new file mode 100644 index 0000000000000000000000000000000000000000..04e403befd45bf6571ffed5913fac53b2d04ca26 --- /dev/null +++ b/mmaction/utils/progress.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import mmengine.dist as dist +import rich.progress as progress +from rich.live import Live + +disable_progress_bar = False +global_progress = progress.Progress( + '{task.description}', + progress.BarColumn(), + progress.TaskProgressColumn(show_speed=True), + progress.TimeRemainingColumn(), +) +global_live = Live(global_progress, refresh_per_second=10) + + +def track(sequence, description: str = '', total: Optional[float] = None): + if disable_progress_bar: + yield from sequence + else: + global_live.start() + task_id = global_progress.add_task(description, total=total) + task = global_progress._tasks[task_id] + try: + yield from global_progress.track(sequence, task_id=task_id) + finally: + if task.total is None: + global_progress.update(task_id, total=task.completed) + if all(task.finished for task in global_progress.tasks): + global_live.stop() + for task_id in global_progress.task_ids: + global_progress.remove_task(task_id) + + +def track_on_main_process(sequence, description='', total=None): + if not dist.is_main_process() or disable_progress_bar: + yield from sequence + else: + yield from track(sequence, total=total, description=description) diff --git a/mmaction/utils/setup_env.py b/mmaction/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1153469265914249bb547258bf99e0b1cf5492b2 --- /dev/null +++ 
b/mmaction/utils/setup_env.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True) -> None: + """Register all modules in mmaction into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmaction default + scope. If True, the global default scope will be set to `mmaction`, + and all registries will build modules from mmaction's registry + node. To understand more about the registry, please refer to + https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ + import mmaction.datasets # noqa: F401,F403 + import mmaction.engine # noqa: F401,F403 + import mmaction.evaluation # noqa: F401,F403 + import mmaction.models # noqa: F401,F403 + import mmaction.structures # noqa: F401,F403 + import mmaction.visualization # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmaction') + if never_created: + DefaultScope.get_instance('mmaction', scope_name='mmaction') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmaction': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmaction", ' + '`register_all_modules` will force set the current' + 'default scope to "mmaction". If this is not as ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmaction-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmaction') diff --git a/mmaction/utils/typing_utils.py b/mmaction/utils/typing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a2c1cb71f23a4c743a999dba3ad40c84df38dca0 --- /dev/null +++ b/mmaction/utils/typing_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +"""Collecting some commonly used type hint in mmaction.""" +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData, LabelData + +from mmaction.structures import ActionDataSample + +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] + +LabelList = List[LabelData] +OptLabelList = Optional[LabelList] + +SampleList = List[ActionDataSample] +OptSampleList = Optional[SampleList] + +ForwardResults = Union[Dict[str, torch.Tensor], List[ActionDataSample], + Tuple[torch.Tensor], torch.Tensor] + + +class SamplingResult: + """Dummy :class:`SamplingResult` in mmdet.""" + + def __init__(self, *args, **kwargs): + pass diff --git a/mmaction/version.py b/mmaction/version.py new file mode 100644 index 0000000000000000000000000000000000000000..3cff4e8e6893591a60b966c85d3aec353b9bfd45 --- /dev/null +++ b/mmaction/version.py @@ -0,0 +1,26 @@ +# Copyright (c) Open-MMLab. All rights reserved. + +__version__ = '1.2.0' + + +def parse_version_info(version_str: str): + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + Returns: + tuple[int or str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). 
+ """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/mmaction/visualization/__init__.py b/mmaction/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a4614aa285e08ede042a84f4f99b6845cf276769 --- /dev/null +++ b/mmaction/visualization/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .action_visualizer import ActionVisualizer +from .video_backend import (LocalVisBackend, TensorboardVisBackend, + WandbVisBackend) + +__all__ = [ + 'ActionVisualizer', 'LocalVisBackend', 'WandbVisBackend', + 'TensorboardVisBackend' +] diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3a695afa9ee0db3709dfec0085f66aea9588eb97 --- /dev/null +++ b/mmaction/visualization/action_visualizer.py @@ -0,0 +1,315 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmengine.dist import master_only +from mmengine.fileio.io import isdir, isfile, join_path, list_dir_or_file +from mmengine.visualization import Visualizer + +from mmaction.registry import VISBACKENDS, VISUALIZERS +from mmaction.structures import ActionDataSample + + +def _get_adaptive_scale(img_shape: Tuple[int, int], + min_scale: float = 0.3, + max_scale: float = 3.0) -> float: + """Get adaptive scale according to frame shape. + + The target scale depends on the the short edge length of the frame. If the + short edge length equals 224, the output is 1.0. And output linear scales + according the short edge length. 
+ + You can also specify the minimum scale and the maximum scale to limit the + linear scale. + + Args: + img_shape (Tuple[int, int]): The shape of the canvas frame. + min_size (int): The minimum scale. Defaults to 0.3. + max_size (int): The maximum scale. Defaults to 3.0. + + Returns: + int: The adaptive scale. + """ + short_edge_length = min(img_shape) + scale = short_edge_length / 224. + return min(max(scale, min_scale), max_scale) + + +@VISUALIZERS.register_module() +class ActionVisualizer(Visualizer): + """Universal Visualizer for classification task. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + fig_save_cfg (dict): Keyword parameters of figure for saving. + Defaults to empty dict. + fig_show_cfg (dict): Keyword parameters of figure for showing. + Defaults to empty dict. + + Examples: + >>> import torch + >>> import decord + >>> from pathlib import Path + >>> from mmaction.structures import ActionDataSample, ActionVisualizer + >>> from mmengine.structures import LabelData + >>> # Example frame + >>> video = decord.VideoReader('./demo/demo.mp4') + >>> video = video.get_batch(range(32)).asnumpy() + >>> # Example annotation + >>> data_sample = ActionDataSample() + >>> data_sample.gt_label = LabelData(item=torch.tensor([2])) + >>> # Setup the visualizer + >>> vis = ActionVisualizer( + ... save_dir="./outputs", + ... vis_backends=[dict(type='LocalVisBackend')]) + >>> # Set classes names + >>> vis.dataset_meta = {'classes': ['running', 'standing', 'sitting']} + >>> # Save the visualization result by the specified storage backends. 
+ >>> vis.add_datasample('demo', video) + >>> assert Path('outputs/vis_data/demo/frames_0/1.png').exists() + >>> assert Path('outputs/vis_data/demo/frames_0/2.png').exists() + >>> # Save another visualization result with the same name. + >>> vis.add_datasample('demo', video, step=1) + >>> assert Path('outputs/vis_data/demo/frames_1/2.png').exists() + """ + + def __init__( + self, + name='visualizer', + vis_backends: Optional[List[Dict]] = None, + save_dir: Optional[str] = None, + fig_save_cfg=dict(frameon=False), + fig_show_cfg=dict(frameon=False) + ) -> None: + super().__init__( + name=name, + image=None, + vis_backends=vis_backends, + save_dir=save_dir, + fig_save_cfg=fig_save_cfg, + fig_show_cfg=fig_show_cfg) + + def _load_video(self, + video: Union[np.ndarray, Sequence[np.ndarray], str], + target_resolution: Optional[Tuple[int]] = None): + """Load video from multiple source and convert to target resolution. + + Args: + video (np.ndarray, str): The video to draw. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. 
+ """ + if isinstance(video, np.ndarray) or isinstance(video, list): + frames = video + elif isinstance(video, str): + # video file path + if isfile(video): + try: + import decord + except ImportError: + raise ImportError( + 'Please install decord to load video file.') + video = decord.VideoReader(video) + frames = [x.asnumpy()[..., ::-1] for x in video] + # rawframes folder path + elif isdir(video): + frame_list = sorted(list_dir_or_file(video, list_dir=False)) + frames = [mmcv.imread(join_path(video, x)) for x in frame_list] + else: + raise TypeError(f'type of video {type(video)} not supported') + + if target_resolution is not None: + w, h = target_resolution + frame_h, frame_w, _ = frames[0].shape + if w == -1: + w = int(h / frame_h * frame_w) + if h == -1: + h = int(w / frame_w * frame_h) + frames = [mmcv.imresize(f, (w, h)) for f in frames] + + return frames + + @master_only + def add_datasample(self, + name: str, + video: Union[np.ndarray, Sequence[np.ndarray], str], + data_sample: Optional[ActionDataSample] = None, + draw_gt: bool = True, + draw_pred: bool = True, + draw_score: bool = True, + rescale_factor: Optional[float] = None, + show_frames: bool = False, + text_cfg: dict = dict(), + wait_time: float = 0.1, + out_path: Optional[str] = None, + out_type: str = 'img', + target_resolution: Optional[Tuple[int]] = None, + step: int = 0, + fps: int = 4) -> None: + """Draw datasample and save to all backends. + + - If ``out_path`` is specified, all storage backends are ignored + and save the videos to the ``out_path``. + - If ``show_frames`` is True, plot the frames in a window sequentially, + please confirm you are able to access the graphical interface. + + Args: + name (str): The frame identifier. + video (np.ndarray, str): The video to draw. supports decoded + np.ndarray, video file path, rawframes folder path. + data_sample (:obj:`ActionDataSample`, optional): The annotation of + the frame. Defaults to None. 
+ draw_gt (bool): Whether to draw ground truth labels. + Defaults to True. + draw_pred (bool): Whether to draw prediction labels. + Defaults to True. + draw_score (bool): Whether to draw the prediction scores + of prediction categories. Defaults to True. + rescale_factor (float, optional): Rescale the frame by the rescale + factor before visualization. Defaults to None. + show_frames (bool): Whether to display the frames of the video. + Defaults to False. + text_cfg (dict): Extra text setting, which accepts + arguments of :attr:`mmengine.Visualizer.draw_texts`. + Defaults to an empty dict. + wait_time (float): Delay in seconds. 0 is the special + value that means "forever". Defaults to 0.1. + out_path (str, optional): Extra folder to save the visualization + result. If specified, the visualizer will only save the result + frame to the out_path and ignore its storage backends. + Defaults to None. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + target_resolution (Tuple[int], optional): Set to + (desired_width desired_height) to have resized frames. If + either dimension is None, the frames are resized by keeping + the existing aspect ratio. Defaults to None. + step (int): Global step value to record. Defaults to 0. + fps (int): Frames per second for saving video. Defaults to 4. 
+ """ + classes = None + video = self._load_video(video, target_resolution) + tol_video = len(video) + + if self.dataset_meta is not None: + classes = self.dataset_meta.get('classes', None) + + if data_sample is None: + data_sample = ActionDataSample() + + resulted_video = [] + for frame_idx, frame in enumerate(video): + frame_name = 'frame %d of %s' % (frame_idx + 1, name) + if rescale_factor is not None: + frame = mmcv.imrescale(frame, rescale_factor) + + texts = ['Frame %d of total %d frames' % (frame_idx, tol_video)] + self.set_image(frame) + + if draw_gt and 'gt_labels' in data_sample: + gt_labels = data_sample.gt_label + idx = gt_labels.tolist() + class_labels = [''] * len(idx) + if classes is not None: + class_labels = [f' ({classes[i]})' for i in idx] + labels = [ + str(idx[i]) + class_labels[i] for i in range(len(idx)) + ] + prefix = 'Ground truth: ' + texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels)) + + if draw_pred and 'pred_labels' in data_sample: + pred_labels = data_sample.pred_labels + idx = pred_labels.item.tolist() + score_labels = [''] * len(idx) + class_labels = [''] * len(idx) + if draw_score and 'score' in pred_labels: + score_labels = [ + f', {pred_labels.score[i].item():.2f}' for i in idx + ] + + if classes is not None: + class_labels = [f' ({classes[i]})' for i in idx] + + labels = [ + str(idx[i]) + score_labels[i] + class_labels[i] + for i in range(len(idx)) + ] + prefix = 'Prediction: ' + texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels)) + + img_scale = _get_adaptive_scale(frame.shape[:2]) + _text_cfg = { + 'positions': + np.array([(img_scale * 5, ) * 2]).astype(np.int32), + 'font_sizes': int(img_scale * 7), + 'font_families': 'monospace', + 'colors': 'white', + 'bboxes': dict(facecolor='black', alpha=0.5, boxstyle='Round'), + } + _text_cfg.update(text_cfg) + self.draw_texts('\n'.join(texts), **_text_cfg) + drawn_img = self.get_image() + resulted_video.append(drawn_img) + + if show_frames: + frame_wait_time = 1. 
/ fps + for frame_idx, drawn_img in enumerate(resulted_video): + frame_name = 'frame %d of %s' % (frame_idx + 1, name) + if frame_idx < len(resulted_video) - 1: + wait_time = frame_wait_time + else: + wait_time = wait_time + self.show( + drawn_img[:, :, ::-1], + win_name=frame_name, + wait_time=wait_time) + + resulted_video = np.array(resulted_video) + if out_path is not None: + save_dir, save_name = osp.split(out_path) + vis_backend_cfg = dict(type='LocalVisBackend', save_dir=save_dir) + tmp_local_vis_backend = VISBACKENDS.build(vis_backend_cfg) + tmp_local_vis_backend.add_video( + save_name, + resulted_video, + step=step, + fps=fps, + out_type=out_type) + else: + self.add_video( + name, resulted_video, step=step, fps=fps, out_type=out_type) + return resulted_video + + @master_only + def add_video( + self, + name: str, + image: np.ndarray, + step: int = 0, + fps: int = 4, + out_type: str = 'img', + ) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray, optional): The image to be saved. The format + should be RGB. Default to None. + step (int): Global step value to record. Default to 0. + fps (int): Frames per second for saving video. Defaults to 4. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_video( + name, image, step=step, fps=fps, + out_type=out_type) # type: ignore diff --git a/mmaction/visualization/video_backend.py b/mmaction/visualization/video_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..9cc549d1a99be377fbcf66730b74b310ed94816f --- /dev/null +++ b/mmaction/visualization/video_backend.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +from typing import Optional + +import cv2 +import numpy as np +from mmengine.visualization import (LocalVisBackend, TensorboardVisBackend, + WandbVisBackend) +from mmengine.visualization.vis_backend import force_init_env + +from mmaction.registry import VISBACKENDS + +try: + import wandb +except ImportError: + pass + + +@VISBACKENDS.register_module() +class LocalVisBackend(LocalVisBackend): + """Local visualization backend class with video support. + + See mmengine.visualization.LocalVisBackend for more details. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + step: int = 0, + fps: Optional[int] = 4, + out_type: Optional[int] = 'img', + **kwargs) -> None: + """Record the frames of a video to disk. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step (int): Global step value to record. Defaults to 0. + out_type (str): Output format type, choose from 'img', 'gif', + 'video'. Defaults to ``'img'``. + fps (int): Frames per second for saving video. Defaults to 4. 
+ """ + assert frames.dtype == np.uint8 + + if out_type == 'img': + frames_dir = osp.join(self._save_dir, name, f'frames_{step}') + os.makedirs(frames_dir, exist_ok=True) + for idx, frame in enumerate(frames): + drawn_image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + save_file_name = f'{idx}.png' + cv2.imwrite(osp.join(frames_dir, save_file_name), drawn_image) + else: + try: + from moviepy.editor import ImageSequenceClip + except ImportError: + raise ImportError('Please install moviepy to enable ' + 'output file.') + + frames = [x[..., ::-1] for x in frames] + video_clips = ImageSequenceClip(frames, fps=fps) + name = osp.splitext(name)[0] + if out_type == 'gif': + out_path = osp.join(self._save_dir, name + '.gif') + video_clips.write_gif(out_path, logger=None) + elif out_type == 'video': + out_path = osp.join(self._save_dir, name + '.mp4') + video_clips.write_videofile( + out_path, remove_temp=True, logger=None) + + +@VISBACKENDS.register_module() +class WandbVisBackend(WandbVisBackend): + """Wandb visualization backend class with video support. See + mmengine.visualization.WandbVisBackend for more details. + + Note that this requires the ``wandb`` and ``moviepy`` package. A wandb + account login is also required at ``https://wandb.ai/authorize``. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + fps: int = 4, + **kwargs) -> None: + """Record the frames of a video to wandb. + + Note that this requires the ``moviepy`` package. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step is a useless parameter that Wandb does not need. + fps (int): Frames per second. Defaults to 4. 
+ """ + frames = frames.transpose(0, 3, 1, 2) + self._wandb.log({'video': wandb.Video(frames, fps=fps, format='gif')}) + + +@VISBACKENDS.register_module() +class TensorboardVisBackend(TensorboardVisBackend): + """Tensorboard visualization backend class with video support. See + mmengine.visualization.TensorboardVisBackend for more details. + + Note that this requires the ``future`` and ``tensorboard`` package. + """ + + @force_init_env + def add_video(self, + name: str, + frames: np.ndarray, + step: int = 0, + fps: int = 4, + **kwargs) -> None: + """Record the frames of a video to tensorboard. + + Note that this requires the ``moviepy`` package. + + Args: + name (str): The video identifier (frame folder). + frames (np.ndarray): The frames to be saved. The format + should be RGB. The shape should be (T, H, W, C). + step (int): Global step value to record. Defaults to 0. + fps (int): Frames per second. Defaults to 4. + """ + frames = frames.transpose(0, 3, 1, 2) + frames = frames.reshape(1, *frames.shape) + self._tensorboard.add_video(name, frames, global_step=step, fps=fps) diff --git a/model-index.yml b/model-index.yml new file mode 100644 index 0000000000000000000000000000000000000000..d35cdf44f9fcc9d4d41940f49d720651f4035ec7 --- /dev/null +++ b/model-index.yml @@ -0,0 +1,36 @@ +Import: + - configs/detection/acrn/metafile.yml + - configs/detection/lfb/metafile.yml + - configs/detection/slowfast/metafile.yml + - configs/detection/slowonly/metafile.yml + - configs/detection/videomae/metafile.yml + - configs/recognition/c2d/metafile.yml + - configs/recognition/c3d/metafile.yml + - configs/recognition/csn/metafile.yml + - configs/recognition/i3d/metafile.yml + - configs/recognition/mvit/metafile.yml + - configs/recognition/omnisource/metafile.yml + - configs/recognition/r2plus1d/metafile.yml + - configs/recognition/slowfast/metafile.yml + - configs/recognition/slowonly/metafile.yml + - configs/recognition/swin/metafile.yml + - configs/recognition/tanet/metafile.yml + 
- configs/recognition/timesformer/metafile.yml + - configs/recognition/tin/metafile.yml + - configs/recognition/tpn/metafile.yml + - configs/recognition/trn/metafile.yml + - configs/recognition/tsm/metafile.yml + - configs/recognition/tsn/metafile.yml + - configs/recognition/uniformer/metafile.yml + - configs/recognition/uniformerv2/metafile.yml + - configs/recognition/videomae/metafile.yml + - configs/recognition/videomaev2/metafile.yml + - configs/recognition/x3d/metafile.yml + - configs/recognition_audio/resnet/metafile.yml + - configs/localization/bmn/metafile.yml + - configs/localization/bsn/metafile.yml + - configs/retrieval/clip4clip/metafile.yml + - configs/skeleton/2s-agcn/metafile.yml + - configs/skeleton/posec3d/metafile.yml + - configs/skeleton/stgcn/metafile.yml + - configs/skeleton/stgcnpp/metafile.yml diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8fcf90170057548748c9ed4e093881ae0c97d09a --- /dev/null +++ b/projects/README.md @@ -0,0 +1,17 @@ +# Welcome to Projects of MMAction2 + +In this folder, we welcome all contributions of deep-learning video understanding models from the community. + +Here, these requirements, e.g., code standards, are not as strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from the community to make MMAction2 greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. + +We also provide some documentation listed below: + +- [Contribution Guide](https://mmaction2.readthedocs.io/en/latest/get_started/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMAction2. + +- [Discussions](https://github.com/open-mmlab/mmaction2/discussions) + + Welcome to start a discussion! 
diff --git a/projects/actionclip/README.md b/projects/actionclip/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7470963de970181e959581744ae6180f7a964880 --- /dev/null +++ b/projects/actionclip/README.md @@ -0,0 +1,186 @@ +# ActionCLIP Project + +[ActionCLIP: A New Paradigm for Video Action Recognition](https://arxiv.org/abs/2109.08472) + + + +## Abstract + + + +The canonical approach to video action recognition dictates a neural model to do a classic and standard 1-of-N majority vote task. They are trained to predict a fixed set of predefined categories, limiting their transferable ability on new datasets with unseen concepts. In this paper, we provide a new perspective on action recognition by attaching importance to the semantic information of label texts rather than simply mapping them into numbers. Specifically, we model this task as a video-text matching problem within a multimodal learning framework, which strengthens the video representation with more semantic language supervision and enables our model to do zero-shot action recognition without any further labeled data or parameters requirements. Moreover, to handle the deficiency of label texts and make use of tremendous web data, we propose a new paradigm based on this multimodal learning framework for action recognition, which we dub "pre-train, prompt and fine-tune". This paradigm first learns powerful representations from pre-training on a large amount of web image-text or video-text data. Then it makes the action recognition task to act more like pre-training problems via prompt engineering. Finally, it end-to-end fine-tunes on target datasets to obtain strong performance. We give an instantiation of the new paradigm, ActionCLIP, which not only has superior and flexible zero-shot/few-shot transfer ability but also reaches a top performance on general action recognition task, achieving 83.8% top-1 accuracy on Kinetics-400 with a ViT-B/16 as the backbone. + + + +
+ +
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. Run the following command to install `clip`. + +```shell +pip install git+https://github.com/openai/CLIP.git +``` + +Assume that you are located at `$MMACTION2/projects/actionclip`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple 
GPUs by slurm:** + +```bash +mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### Kinetics400 + +| frame sampling strategy | backbone | top1 acc | top5 acc | testing protocol | config | ckpt | +| :---------------------: | :------: | :------: | :------: | :----------------: | :------------------------------------------------------------------: | :-----------------------------------------------------------------: | +| 1x1x8 | ViT-B/32 | 77.6 | 93.8 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth)\[1\] | +| 1x1x8 | ViT-B/16 | 80.3 | 95.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth)\[1\] | +| 1x1x16 | ViT-B/16 | 81.1 | 95.6 | 16 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth)\[1\] | +| 1x1x32 | ViT-B/16 | 81.3 | 95.8 | 32 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth)\[1\] | + +\[1\] The models are ported from the repo [ActionCLIP](https://github.com/sallymmx/ActionCLIP) and tested on our data. Currently, we only support the testing of ActionCLIP models. 
Due to the variation in testing data, our reported test accuracy differs from that of the original repository (on average, it is lower by one point). Please refer to this [issue](https://github.com/sallymmx/ActionCLIP/issues/14) for more details. + +### Kinetics400 (Trained on Our K400 dataset) + +| frame sampling strategy | gpus | backbone | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--: | :------: | :------: | :------: | :---------------: | :-------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| 1x1x8 | 8 | ViT-B/32 | 77.5 | 93.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb_20230801-8535b794.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.log) | +| 1x1x8 | 8 | ViT-B/16 | 81.3 | 95.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb_20230801-b307a0cd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.log) | + +## Zero-Shot Prediction + +We offer two methods for zero-shot prediction as follows. 
The `test.mp4` can be downloaded from [here](https://github-production-user-asset-6210df.s3.amazonaws.com/58767402/237333525-89ebee9a-573e-4e27-9047-0ad6422fa82f.mp4). + +### Using Naive Pytorch + +```python +import torch +import clip +from models.load import init_actionclip +from mmaction.utils import register_all_modules + +register_all_modules(True) + +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = init_actionclip('ViT-B/32-8', device=device) + +video_anno = dict(filename='test.mp4', start_index=0) +video = preprocess(video_anno).unsqueeze(0).to(device) + +template = 'The woman is {}' +labels = ['singing', 'dancing', 'performing'] +text = clip.tokenize([template.format(label) for label in labels]).to(device) + +with torch.no_grad(): + video_features = model.encode_video(video) + text_features = model.encode_text(text) + +video_features /= video_features.norm(dim=-1, keepdim=True) +text_features /= text_features.norm(dim=-1, keepdim=True) +similarity = (100 * video_features @ text_features.T).softmax(dim=-1) +probs = similarity.cpu().numpy() + +print("Label probs:", probs) # [[9.995e-01 5.364e-07 6.666e-04]] +``` + +### Using MMAction2 APIs + +```python +import mmengine +import torch +from mmaction.utils import register_all_modules +from mmaction.apis import inference_recognizer, init_recognizer + +register_all_modules(True) + +config_path = 'configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py' +checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth' +template = 'The woman is {}' +labels = ['singing', 'dancing', 'performing'] + +# Update the labels, the default is the label list of K400. 
+config = mmengine.Config.fromfile(config_path) +config.model.labels_or_label_file = labels +config.model.template = template + +device = "cuda" if torch.cuda.is_available() else "cpu" +model = init_recognizer(config=config, checkpoint=checkpoint_path, device=device) + +pred_result = inference_recognizer(model, 'test.mp4') +probs = pred_result.pred_score.cpu().numpy() +print("Label probs:", probs) # [9.995e-01 5.364e-07 6.666e-04] +``` + +## Citation + + + +```bibtex +@article{wang2021actionclip, + title={Actionclip: A new paradigm for video action recognition}, + author={Wang, Mengmeng and Xing, Jiazheng and Liu, Yong}, + journal={arXiv preprint arXiv:2109.08472}, + year={2021} +} +``` diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..39b8d1100d8f32236d4d33f1124b651307832899 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 16 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + 
dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..9b78fd201167251ce27a2fc9699e52e53f0a11f0 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 32 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..d29281b9694b7891f1bd7f0d5a77d37c9392bc84 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = 
dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..a585b44e3112a6aa00ae3efaacac22dfa4f520ec --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py @@ -0,0 +1,162 @@ +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/16', + num_adapter_segs=num_segs, + num_adapter_layers=6, + to_float32=True, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400/': 's3://openmmlab/datasets/action/Kinetics400/'})) + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', clip_len=1, frame_interval=1, num_clips=num_segs), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, .875, .75, .66), + random_crop=False, + num_fixed_crops=13, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') 
+] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-6, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.2), + paramwise_cfg=dict(custom_keys=dict(adapter=dict(lr_mult=10)))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=45, + eta_min=0, + by_epoch=True, + begin=5, + end=50, + convert_to_iter_based=True) +] + +# Default setting for scaling LR automatically +# - 
`enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=100, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..5e127429427fa37ea7144c20d35c404f7fb55ea2 --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py @@ -0,0 +1,52 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/32', + num_adapter_segs=num_segs, + num_adapter_layers=6, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 
'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict(type='AccMetric') diff --git a/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..bb38ebfb2ae413d78004aa1739ee7ea949e7816e --- /dev/null +++ b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_g8xb16_1x1x8_k400-rgb.py @@ -0,0 +1,162 @@ +custom_imports = dict(imports='models') + +num_segs = 8 + +model = dict( + type='ActionClip', + clip_arch='ViT-B/32', + num_adapter_segs=num_segs, + num_adapter_layers=6, + to_float32=True, + labels_or_label_file='configs/label_map_k400.txt', + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[122.771, 116.746, 104.093], + std=[68.500, 66.632, 70.323], + format_shape='NCHW')) + +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + 
+file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400/': 's3://openmmlab/datasets/action/Kinetics400/'})) + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', clip_len=1, frame_interval=1, num_clips=num_segs), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, .875, .75, .66), + random_crop=False, + num_fixed_crops=13, + max_wh_scale_gap=1), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_pipeline = val_pipeline + +train_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + 
test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-6, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.2), + paramwise_cfg=dict(custom_keys=dict(adapter=dict(lr_mult=10)))) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.01, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=45, + eta_min=0, + by_epoch=True, + begin=5, + end=50, + convert_to_iter_based=True) +] + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=100, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/actionclip/configs/label_map_k400.txt b/projects/actionclip/configs/label_map_k400.txt new file mode 100644 index 0000000000000000000000000000000000000000..9193a07c6bda30b85b591da52e5e4cb375c31c06 --- 
/dev/null +++ b/projects/actionclip/configs/label_map_k400.txt @@ -0,0 +1,400 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +balloon blowing +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +drinking +drinking beer +drinking shots +driving car +driving tractor +drop kicking +drumming fingers +dunking basketball +dying hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm 
+exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +garbage collecting +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking fruit +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing 
saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing (not slalom or crosscountry) +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +sniffing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow 
tie +tying knot (not on a tie) +tying tie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba diff --git a/projects/actionclip/models/__init__.py b/projects/actionclip/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a4bdfcb32b4b2d12767cffc92713800cef905d5 --- /dev/null +++ b/projects/actionclip/models/__init__.py @@ -0,0 +1,4 @@ +from .actionclip import ActionClip +from .load import init_actionclip + +__all__ = ['ActionClip', 'init_actionclip'] diff --git a/projects/actionclip/models/actionclip.py b/projects/actionclip/models/actionclip.py new file mode 100644 index 0000000000000000000000000000000000000000..89975b82ea886dcfb20a49d366f05c9bc5f7f8e4 --- /dev/null +++ b/projects/actionclip/models/actionclip.py @@ -0,0 +1,176 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import clip +import mmengine +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.dist import all_gather, get_rank +from mmengine.model import BaseModel +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from .adapter import TransformerAdapter + + +class GatherLayer(torch.autograd.Function): + + @staticmethod + def forward(ctx: Any, input: torch.Tensor) -> Tuple[List]: + ctx.save_for_backward(input) + output = all_gather(input) + return tuple(output) + + @staticmethod + def backward(ctx: Any, *grads: torch.Tensor) -> torch.Tensor: + input, = ctx.saved_tensors + grad_out = torch.zeros_like(input) + grad_out[:] = grads[get_rank()] + return grad_out + + +def text_prompt(labels_or_label_file, templates_or_template_file=None): + if 
isinstance(labels_or_label_file, str): + labels = mmengine.list_from_file(labels_or_label_file) + elif isinstance(labels_or_label_file, list): + labels = labels_or_label_file + else: + raise ValueError(f'`labels_or_label_file` must be `list` or `str`, ' + f'but got {type(labels_or_label_file)}') + + if templates_or_template_file is None: + templates = [ + 'a photo of action {}', 'a picture of action {}', + 'Human action of {}', '{}, an action', '{} this is an action', + '{}, a video of action', 'Playing action of {}', '{}', + 'Playing a kind of action, {}', 'Doing a kind of action, {}', + 'Look, the human is {}', 'Can you recognize the action of {}?', + 'Video classification of {}', 'A video of {}', 'The man is {}', + 'The woman is {}' + ] + elif isinstance(templates_or_template_file, str): + templates = mmengine.list_from_file(templates_or_template_file) + elif mmengine.is_seq_of(templates_or_template_file, str): + # Fix: a user-supplied list of template strings previously fell through + # without assigning `templates`, causing a NameError below. + templates = list(templates_or_template_file) + else: + raise ValueError(f'`template` must be list of `str`, `str` or `None`, ' + f'but got {type(templates_or_template_file)}') + + num_prompt = len(templates) + prompt = torch.cat( + [clip.tokenize(t.format(c)) for t in templates for c in labels]) + return prompt, num_prompt + + +@MODELS.register_module() +class ActionClip(BaseModel): + + def __init__(self, + clip_arch: str, + num_adapter_segs: int, + num_adapter_layers: int = 6, + to_float32: bool = False, + labels_or_label_file: Optional[Union[List[str], str]] = None, + templates_or_template_file: Optional[Union[List[str], + str]] = None, + data_preprocessor: Optional[Dict] = None, + loss: Dict = dict(type='CrossEntropyLoss', loss_weight=0.5)): + super(ActionClip, self).__init__(data_preprocessor=data_preprocessor) + self.clip = clip.load(clip_arch, device='cpu')[0] + if to_float32: + self.clip.float() + + self.adapter = TransformerAdapter(self.clip, num_adapter_segs, + num_adapter_layers) + + self.loss = MODELS.build(loss) + + if labels_or_label_file is not None: + self.prompt, self.num_prompt = 
text_prompt( + labels_or_label_file, templates_or_template_file) + + def encode_video(self, video): + b, n, c, h, w = video.shape + video = video.view(-1, c, h, w) + frames_features = self.encode_image(video) + frames_features = frames_features.view(b, n, -1) + video_features = self.adapter(frames_features) + return video_features + + def encode_image(self, image): + return self.clip.encode_image(image) + + def encode_text(self, text): + return self.clip.encode_text(text) + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[List] = None, + mode: str = 'tensor'): + + if mode == 'tensor': + return self.encode_video(inputs) + + elif mode == 'predict': + assert hasattr(self, 'prompt'),\ + '`labels_or_label_file` is required to perform prediction. ' + + video_features = self.encode_video(inputs) + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + + bsz = len(data_samples) + num_views = video_features.shape[0] // bsz + + text_features = self.encode_text(self.prompt.to(inputs.device)) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + # (bsz*num_views, num_prompt, num_classes) -> + # (bsz, num_views*num_prompt, num_classes) + similarity = (100.0 * video_features @ text_features.T). 
\ + view(bsz, num_views * self.num_prompt, -1) + + cls_scores = F.softmax(similarity, dim=2).mean(dim=1) + + for data_sample, score in zip(data_samples, cls_scores): + data_sample.pred_scores = LabelData(item=score) + + return data_samples + + elif mode == 'loss': + video_features = self.encode_video(inputs) + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + + text_id = np.random.randint( + self.num_prompt, size=len(data_samples)) + real_labels = [x.gt_labels.item.item() for x in data_samples] + selected_prompt = self.prompt.view( + self.num_prompt, -1, + self.prompt.shape[-1])[text_id, real_labels].to(inputs.device) + + text_features = self.encode_text(selected_prompt) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + video_features = torch.cat( + GatherLayer.apply(video_features), dim=0) + text_features = torch.cat(GatherLayer.apply(text_features), dim=0) + + logit_scale = self.clip.logit_scale.exp() + logits_per_video = logit_scale * video_features @ text_features.t() + logits_per_text = logits_per_video.t() + labels = torch.arange(logits_per_video.shape[0]).to( + logit_scale.device) + + sim_loss_v2t = self.loss(logits_per_video, labels) + sim_loss_t2v = self.loss(logits_per_text, labels) + + losses = dict() + losses['sim_loss_v2t'] = sim_loss_v2t + losses['sim_loss_t2v'] = sim_loss_t2v + return losses + + else: + raise RuntimeError( + f'Invalid mode "{mode}". ' + 'Only supports `predict`, `loss` and `tensor` mode. 
') diff --git a/projects/actionclip/models/adapter.py b/projects/actionclip/models/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..b86cffa8c64d6c964287fcfe5cf8f51a650a6f2c --- /dev/null +++ b/projects/actionclip/models/adapter.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from clip.model import Transformer +from mmengine.model import BaseModule + + +class TransformerAdapter(BaseModule): + + def __init__(self, + clip_model: nn.Module, + num_segs: int, + num_layers: int = 6): + super(TransformerAdapter, self).__init__() + self.num_segs = num_segs + + embed_dim = clip_model.text_projection.shape[1] + transformer_width = clip_model.ln_final.weight.shape[0] + transformer_heads = transformer_width // 64 + + self.frame_position_embeddings = nn.Embedding(self.num_segs, embed_dim) + self.transformer = Transformer( + width=embed_dim, layers=num_layers, heads=transformer_heads) + + def init_weights(self): + for module in self.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def forward(self, x: torch.Tensor): + b, seq_length, c = x.size() + + x_original = x + position_ids = torch.arange( + seq_length, dtype=torch.long, device=x.device) + embeddings = self.frame_position_embeddings(position_ids) + x = x + embeddings.unsqueeze(0) + x = x.transpose(0, 1) # NLD -> LND + x = self.transformer(x) + x = x.transpose(0, 1) # LND -> NLD + x = x.type(x_original.dtype) + x_original + return x.mean(dim=1) diff --git a/projects/actionclip/models/load.py b/projects/actionclip/models/load.py new file mode 100644 index 0000000000000000000000000000000000000000..baf773c68f9e8d75ebd46fe2c270cc3142c57723 --- /dev/null +++ b/projects/actionclip/models/load.py @@ -0,0 +1,72 @@ +import torch +from 
mmengine.dataset import Compose +from mmengine.runner.checkpoint import _load_checkpoint +from torchvision.transforms import Normalize + +from .actionclip import ActionClip + +_MODELS = { + 'ViT-B/32-8': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth', # noqa: E501 + 'ViT-B/16-8': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth', # noqa: E501 + 'ViT-B/16-16': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth', # noqa: E501 + 'ViT-B/16-32': + 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth', # noqa: E501 +} + + +def available_models(): + """Returns the names of available ActionCLIP models.""" + return list(_MODELS.keys()) + + +def _transform(num_segs): + pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=num_segs, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + lambda x: torch.tensor(x['imgs']).div(255), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ] + return Compose(pipeline) + + +def init_actionclip(name, device): + assert name in _MODELS, \ + f'Model {name} not found; available models = {available_models()}' + model_path = _MODELS[name] + + checkpoint = _load_checkpoint(model_path, map_location='cpu') + state_dict = checkpoint['state_dict'] + + clip_arch = name.split('-')[0] + '-' + name.split('-')[1] + + num_adapter_segs = int(name.split('-')[2]) + assert num_adapter_segs == \ + state_dict['adapter.frame_position_embeddings.weight'].shape[0] + num_adapter_layers 
= len([ + k for k in state_dict.keys() + if k.startswith('adapter.') and k.endswith('.attn.in_proj_weight') + ]) + + model = ActionClip( + clip_arch=clip_arch, + num_adapter_segs=num_adapter_segs, + num_adapter_layers=num_adapter_layers) + + model.load_state_dict(state_dict) + model.to(device) + model.eval() + + return model, _transform(num_adapter_segs) diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c7736849ea900fc0c40d0b7d6b9d64874554c80 --- /dev/null +++ b/projects/ctrgcn/README.md @@ -0,0 +1,113 @@ +# CTRGCN Project + +[Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213) + + + +## Abstract + + + +Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets. + + + +
+ +
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +Assume that you are located at `$MMACTION2/projects/ctrgcn`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 
8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.6 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230308-7aba454e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.0 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-950dca0a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{chen2021channel, + title={Channel-wise topology refinement graph convolution for skeleton-based action recognition}, + author={Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng 
and Li, Bing and Deng, Ying and Hu, Weiming}, + booktitle={CVPR}, + pages={13359--13368}, + year={2021} +} +``` diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..3835d11fdd0e438f48751165eabfa00a36c47b68 --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='coco', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + 
split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..03a1c302f53ff9136d9c0c49937b774c6cb4340f --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='nturgb+d', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = 
dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/models/__init__.py b/projects/ctrgcn/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0cca6c4bd105b327a22cbd00a04b30c289ce2fb2 --- /dev/null +++ b/projects/ctrgcn/models/__init__.py @@ -0,0 +1,3 @@ +from .ctrgcn import CTRGCN + +__all__ = ['CTRGCN'] diff --git a/projects/ctrgcn/models/ctrgcn.py b/projects/ctrgcn/models/ctrgcn.py new file mode 100644 index 0000000000000000000000000000000000000000..73e884d5702df870eea9790f8bc4912aa243a8c3 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn.py @@ -0,0 +1,104 @@ +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.models.utils import Graph, unit_tcn +from mmaction.registry import MODELS +from .ctrgcn_utils import MSTCN, unit_ctrgcn + + +class CTRGCNBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + kernel_size=5, + dilations=[1, 2], + tcn_dropout=0): + super(CTRGCNBlock, self).__init__() + self.gcn1 = unit_ctrgcn(in_channels, out_channels, A) + self.tcn1 = MSTCN( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False, + tcn_dropout=tcn_dropout) + self.relu = nn.ReLU(inplace=True) + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +@MODELS.register_module() +class CTRGCN(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=64, + num_stages=10, + inflate_stages=[5, 8], + down_stages=[5, 8], + pretrained=None, + num_person=2, + **kwargs): + super(CTRGCN, self).__init__() + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + 
self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + self.num_person = num_person + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + + kwargs0 = {k: v for k, v in kwargs.items() if k != 'tcn_dropout'} + modules = [ + CTRGCNBlock( + in_channels, + base_channels, + A.clone(), + residual=False, + **kwargs0) + ] + for i in range(2, num_stages + 1): + in_channels = base_channels + out_channels = base_channels * (1 + (i in inflate_stages)) + stride = 1 + (i in down_stages) + modules.append( + CTRGCNBlock( + base_channels, + out_channels, + A.clone(), + stride=stride, + **kwargs)) + base_channels = out_channels + self.net = ModuleList(modules) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + x = self.data_bn(x.view(N, M * V * C, T)) + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for gcn in self.net: + x = gcn(x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/projects/ctrgcn/models/ctrgcn_utils.py b/projects/ctrgcn/models/ctrgcn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6fe3a8529f95c6817141cc508e3abfc33fb47872 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn_utils.py @@ -0,0 +1,192 @@ +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. 
+class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) 
+ branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class CTRGC(BaseModule): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + super(CTRGC, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels <= 16: + self.rel_channels = 8 + else: + self.rel_channels = in_channels // rel_reduction + self.conv1 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv2 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv3 = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.conv4 = nn.Conv2d( + self.rel_channels, self.out_channels, kernel_size=1) + self.tanh = nn.Tanh() + + def forward(self, x, A=None, alpha=1): + # Input: N, C, T, V + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean( + -2), self.conv3(x) + # X1, X2: N, R, V + # N, R, V, 1 - N, R, 1, V + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + # N, R, V, V + x1 = self.conv4(x1) * alpha + (A[None, None] if A is not None else 0 + ) # N,C,V,V + x1 = torch.einsum('ncuv,nctu->nctv', x1, x3) + return x1 + + +class unit_ctrgcn(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + init_cfg=[ + dict( + type='Constant', + layer='BatchNorm2d', + val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + + super(unit_ctrgcn, self).__init__(init_cfg=init_cfg) + inter_channels = out_channels // 4 + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + + self.num_subset = A.shape[0] + self.convs = ModuleList() + + for i in range(self.num_subset): + 
self.convs.append(CTRGC(in_channels, out_channels)) + + if in_channels != out_channels: + self.down = Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels)) + else: + self.down = lambda x: x + + self.A = nn.Parameter(A.clone()) + + self.alpha = nn.Parameter(torch.zeros(1)) + self.bn = nn.BatchNorm2d(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + y = None + + for i in range(self.num_subset): + z = self.convs[i](x, self.A[i], self.alpha) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) diff --git a/projects/example_project/README.md b/projects/example_project/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d78eb9b099164bd6dbc763a7056b4dc51c95e7a3 --- /dev/null +++ b/projects/example_project/README.md @@ -0,0 +1,122 @@ +# Example Project + +This is an example README for community `projects/`. You can write your README in your own project. Here are +some recommended parts of a README for others to understand and use your project, you can copy or modify them +according to your project. + +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md). 
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-------------------------------------------: | -------------------------------------: | -----------------------------: | +| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://example/checkpoint/url) | [log](https://example/log/url) | + +## Citation + + + +```bibtex +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = 
{\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Converted checkpoint and results (Only for reproduction) + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..61bb5310c6aba9efe69fd3b2df29d269ada067c2 --- /dev/null +++ b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,11 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmaction::recognition/tsn/' \ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the backbone to use your own backbone. +_base_['model']['backbone'] = dict(type='ExampleNet', depth=50) +# Modify the in_channels of classifier head to fit your backbone. 
+_base_['model']['cls_head']['in_channels'] = 2048 diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..826d70dafe534369df6e5f7a36929726697bb2d9 --- /dev/null +++ b/projects/example_project/models/__init__.py @@ -0,0 +1,3 @@ +from .example_net import ExampleNet + +__all__ = ['ExampleNet'] diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py new file mode 100644 index 0000000000000000000000000000000000000000..415251a7a76c75a20b03b574e8ebf64d54f0ea0d --- /dev/null +++ b/projects/example_project/models/example_net.py @@ -0,0 +1,21 @@ +from mmaction.models import ResNet +from mmaction.registry import MODELS + + +# Register your model to the `MODELS`. +@MODELS.register_module() +class ExampleNet(ResNet): + """Implements an example backbone. + + Implement the backbone network just like a normal pytorch network. + """ + + def __init__(self, **kwargs) -> None: + print('#############################\n' + '# Hello MMAction2! #\n' + '#############################') + super().__init__(**kwargs) + + def forward(self, x): + """Defines the computation performed at every call.""" + return super().forward(x) diff --git a/projects/gesture_recognition/README.md b/projects/gesture_recognition/README.md new file mode 100644 index 0000000000000000000000000000000000000000..519960dc0d7b0b3844a0bbd503d3a73428572ca5 --- /dev/null +++ b/projects/gesture_recognition/README.md @@ -0,0 +1,33 @@ +# Gesture Recognition + + + +## Introduction + + + +In this project, we present a skeleton based pipeline for gesture recognition. The pipeline is three-stage. The first stage consists of a hand detection module that outputs bounding boxes of human hands from video frames. Afterwards, the second stage employs a pose estimation module to generate keypoints of the detected hands. 
Finally, the third stage utilizes a skeleton-based gesture recognition module to classify hand actions based on the provided hand skeleton. The three-stage pipeline is lightweight and can achieve real-time on CPU devices. In this README, we provide the models and the inference demo for the project. Training data preparation and training scripts are described in [TRAINING.md](/projects/gesture_recognition/TRAINING.md). + +## Hand detection stage + +Hand detection results on OneHand10K validation dataset + +| Config | Input Size | bbox mAP | bbox mAP 50 | bbox mAP 75 | ckpt | log | +| :------------------------------------------------------ | :--------: | :------: | :---------: | :---------: | :---------------------------------------------------: | :--------------------------------------------------: | +| [rtmdet_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | + +## Pose estimation stage + +Pose estimation results on COCO-WholeBody-Hand validation set + +| Config | Input Size | PCK@0.2 | AUC | EPE | ckpt | +| :----------------------------------------------------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :-------------------------------------: | +| [rtmpose_m](/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/) | + +## Gesture recognition stage + +Skeleton base gesture recognition results on Jester validation + +| Config | Input Size | Top 1 accuracy | Top 5 accuracy | ckpt | log | +| 
:------------------------------------------------------ | :--------: | :------------: | :------------: | :----------------------------------------------------: | :---------------------------------------------------: | +| [STGCNPP](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) | 100x17x3 | 89.22 | 97.52 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d_20230524-fffa7ff0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.log) | diff --git a/projects/gesture_recognition/TRAINING.md b/projects/gesture_recognition/TRAINING.md new file mode 100644 index 0000000000000000000000000000000000000000..abd2feb87c4527d989511c2db52e5cdf395288e5 --- /dev/null +++ b/projects/gesture_recognition/TRAINING.md @@ -0,0 +1,89 @@ +In this document, we show how to prepare the training data and train models required for this project. + +# Hand detection + +## Data Preparation + +We use multiple hand pose estimation datasets to generate a hand detection dataset. The circumscribed rectangle of hand key points of is used as the detection bounding box of the hand. In our demo, we use 4 datasets supported from [MMPose](https://github.com/open-mmlab/mmpose): [FreiHAND Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#freihand-dataset), [OneHand10K](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#onehand10k), [RHD Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#rhd-dataset) and [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe). You can find instructions for preparing each dataset from the corresponding link. 
+
+To train the hand detection model, you need to install [MMDet](https://github.com/open-mmlab/mmdetection) and move (or link) the above datasets to `$MMDet/data/`. The folder structure should look like this:
+
+```
+mmdetection
+├── mmdetection
+├── docs
+├── tests
+├── tools
+├── configs
+|── data
+    |-- freihand
+        │-- annotations
+        │-- ..
+    |-- onehand10k
+        │-- annotations
+        │-- ..
+    |-- rhd
+        │-- annotations
+        │-- ..
+    │-- halpe
+        │-- annotations
+        |-- hico_20160224_det
+            │-- images
+                |-- ..
+        │-- ..
+```
+
+We provide a [parse_pose.py](/projects/gesture_recognition/parse_pose.py) file to convert the annotation files of the above pose datasets to a COCO-style detection annotation. Suppose you are at `$MMDet/data`, run the following command and it will generate `hand_det_train.json` and `hand_det_val.json` at `$MMDet/data/hand_det/`
+
+```
+python3 $MMAction/projects/gesture_recognition/parse_pose.py
+```
+
+The training annotation file combines the above four data sets, and the validation annotation file just uses the OneHand10K validation for a quick verification. You can also add more hand detection datasets to improve performance. Now we are done with data preparation.
+
+## Training and inference
+
+We provide a [config](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) to train an [RTMDet](https://arxiv.org/abs/2212.07784) detection model. Suppose you are at `$MMDet`, you can run the following command to train the hand detection model with 8 GPUs:
+
+```bash
+bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py 8
+```
+
+To see the detection result for a single image, we can use `$MMDet/demo/image_demo.py`.
The following command will do inference on a single [image](/projects/gesture_recognition/demo/hand_det.jpg) (from a video in the [jester dataset](/tools/data/jester)) and the output should be similar to [this image](/projects/gesture_recognition/demo/hand_det_out.jpg).
+
+```bash
+python3 $MMDet/demo/image_demo.py $MMAction/projects/gesture_recognition/demo/hand_det.jpg PATH_TO_HAND_DET_CHECKPOINT --out-dir='.'
+```
+
+# Pose estimation
+
+We directly use the pose estimation model from MMPose. Please refer to [RTMPose](https://github.com/open-mmlab/mmpose/tree/main/configs/hand_2d_keypoint/rtmpose) for details.
+
+# Gesture recognition
+
+## Data Preparation
+
+We use the [jester dataset](/tools/data/jester) to train a skeleton based gesture recognition model. Please follow the link to prepare this dataset (in frames).
+
+Once we have the jester dataset, we provide the [extract_keypoint.py](/projects/gesture_recognition/extract_keypoint.py) to extract the hand keypoints for all video frames in the dataset. This step requires the hand detection model and the pose estimation model in the above two stages. Here is an example to extract the keypoints for the dataset. You may need to modify the path to the dataset, configs or checkpoints according to your system.
+
+```bash
+ROOT_TO_JESTER='20bn-jester-v1'
+POSE_CONFIG='rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py'
+POSE_CKPT='hand-cocktail5-4e-4-bs256-210e-b74fb594_20230320.pth'
+DET_CONFIG='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py'
+DET_CKPT='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth'
+python3 -u extract_keypoint.py $ROOT_TO_JESTER \
+    --pose_config $POSE_CONFIG --pose_ckpt $POSE_CKPT \
+    --det_config $DET_CONFIG --det_ckpt $DET_CKPT
+```
+
+The program will generate a `jester.pkl` file in your current directory. Then move this file to `$MMAction`. We will use this file for skeleton based gesture recognition training.
+
+## Training and inference
+
+We provide a [config](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) to train a STGCN++ model. Suppose you are at `$MMAction`, you can run the following command to train the model with 8 GPUs:
+
+```bash
+bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py 8
+```
diff --git a/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py
new file mode 100644
index 0000000000000000000000000000000000000000..f91b71f12fa314ed2d331d258c48d2cea1b862ed
--- /dev/null
+++ b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py
@@ -0,0 +1,123 @@
+_base_ = 'mmdet::rtmdet/rtmdet_nano_8xb32-300e_coco.py'
+
+input_shape = 320
+
+model = dict(
+    backbone=dict(
+        deepen_factor=0.33,
+        widen_factor=0.25,
+        use_depthwise=True,
+    ),
+    neck=dict(
+        in_channels=[64, 128, 256],
+        out_channels=64,
+        num_csp_blocks=1,
+        use_depthwise=True,
+    ),
+    bbox_head=dict(
+        in_channels=64,
+        feat_channels=64,
+        share_conv=False,
+        exp_on_reg=False,
+        use_depthwise=True,
+        num_classes=1),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+data_root = 'data/'
+file_client_args = dict(backend='disk')
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='CachedMosaic',
+        img_scale=(input_shape, input_shape),
+        pad_val=114.0,
+        max_cached_images=20,
+        random_pop=False),
+    dict(
+        type='RandomResize',
+        scale=(input_shape * 2, input_shape * 2),
+        ratio_range=(0.5, 1.5),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(input_shape, input_shape), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='hand_det/hand_det_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + metainfo=dict(classes=('hand', )), + )) + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='hand_det/hand_det_val.json', + data_prefix=dict(img=''), + pipeline=test_pipeline, + metainfo=dict(classes=('hand', )), + )) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'hand_det/hand_det_val.json') +test_evaluator = val_evaluator + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py 
new file mode 100644 index 0000000000000000000000000000000000000000..3fbecfa61e0c2dfe3becc9e1d738989d004588c0 --- /dev/null +++ b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,339 @@ +default_scope = 'mmpose' +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', + interval=10, + save_best='AUC', + rule='greater', + max_keep_ckpts=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=180, + switch_pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0.0, + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') + ]) +] +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') 
+log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False +file_client_args = dict(backend='disk') +train_cfg = dict(by_epoch=True, max_epochs=210, val_interval=10) +val_cfg = dict() +test_cfg = dict() +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 0.004 +randomness = dict(seed=21) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=105, + end=210, + T_max=105, + by_epoch=True, + convert_to_iter_based=True) +] +auto_scale_lr = dict(base_batch_size=256) +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=('https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-' + 'f2f7d6f6_20230130.pth'))), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=(256, 256), + in_featuremap_size=(8, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + 
type='KLDiscretLoss', + use_target_weight=True, + beta=10.0, + label_softmax=True), + decoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + test_cfg=dict(flip_test=True)) +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' +train_pipeline = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0.0, + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + 
min_width=0.2, + p=0.5) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') +] +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0) + ]), + dict( + type='GenerateTarget', + encoder=dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False)), + dict(type='PackPoseInputs') + ])) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') + ])) +test_dataloader 
= dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyHandDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImage', file_client_args=dict(backend='disk')), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(256, 256)), + dict(type='PackPoseInputs') + ])) +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..be327212f7fd6eceee601818790fd6343daabba0 --- /dev/null +++ b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py @@ -0,0 +1,113 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +hand_layout = dict( + num_node=17, + inward=[(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), (12, 6), (9, 7), + (7, 5), (10, 8), (8, 6), (5, 0), (6, 0), (1, 0), (3, 1), (2, 0), + (4, 2)], + center=0) + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn', + graph_cfg=dict(layout=hand_layout, mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=27, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'jester.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + 
dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=1), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), 
logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/gesture_recognition/demo/hand_det.jpg b/projects/gesture_recognition/demo/hand_det.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c12616fc493b050782bdf5602b40c876e23fa877 Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det.jpg differ diff --git a/projects/gesture_recognition/demo/hand_det_out.jpg b/projects/gesture_recognition/demo/hand_det_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2f3c1bed417f63bf0692a63cf4d4b722bca33590 Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det_out.jpg differ diff --git a/projects/gesture_recognition/extract_keypoint.py b/projects/gesture_recognition/extract_keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1476fbb6595801c76cf5aadba309de5a93edae9c --- /dev/null +++ b/projects/gesture_recognition/extract_keypoint.py @@ -0,0 +1,115 @@ +import copy +import os +import pickle +import time +from argparse import ArgumentParser + +import cv2 +import numpy as np +import torch +from mmdet.apis import init_detector +from mmengine.dataset import Compose, pseudo_collate +from mmengine.registry import init_default_scope +from mmpose.apis import init_model +from PIL import Image + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('root', help='Video folder root') + parser.add_argument('--pose_config', help='Pose config file') + parser.add_argument('--pose_ckpt', help='Pose checkpoint file') + parser.add_argument('--det_config', help='Hand detection config file') + parser.add_argument('--det_ckpt', help='Hand detection checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for 
inference') + args = parser.parse_args() + return args + + +@torch.no_grad() +def inference_topdown(model, pose_pipeline, det_model, det_pipeline, folder): + + img_paths = [f'{folder}/{img}' for img in os.listdir(folder)] + + w, h = Image.open(img_paths[0]).size + bbox0 = np.array([[0, 0, w, h]], dtype=np.float32) + + imgs = [cv2.imread(img_path) for img_path in img_paths] + + data_list = [ + dict(img=copy.deepcopy(img), img_id=idx) + for idx, img in enumerate(imgs) + ] + data_list = [det_pipeline(data_info) for data_info in data_list] + batch = pseudo_collate(data_list) + bbox_results = det_model.test_step(batch) + bboxes = [i.pred_instances.bboxes[:1].cpu().numpy() for i in bbox_results] + scores = [] + for i in bbox_results: + try: + score = i.pred_instances.scores[0].item() + except Exception as ex: + print(ex) + score = 0 + scores.append(score) + data_list = [] + for img, bbox, score in zip(imgs, bboxes, scores): + data_info = dict(img=img) + if bbox.shape == bbox0.shape and score > 0.3: + if score > 0.5: + data_info['bbox'] = bbox + else: + w = (score - 0.1) / 0.4 + data_info['bbox'] = w * bbox + (1 - w) * bbox0 + else: + data_info['bbox'] = bbox0 + data_info['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,) + data_info.update(model.dataset_meta) + data_list.append(pose_pipeline(data_info)) + + batch = pseudo_collate(data_list) + results = model.test_step(batch) + + lookup = {} + for img_path, result in zip(img_paths, results): + keypoints = result.pred_instances.keypoints + scores = result.pred_instances.keypoint_scores + lookup[img_path] = (keypoints, scores, (w, h)) + return lookup + + +def main(): + args = parse_args() + + det_model = init_detector( + args.det_config, args.det_ckpt, device=args.device) + det_model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + det_pipeline = Compose(det_model.cfg.test_dataloader.dataset.pipeline) + + model = init_model( + args.pose_config, args.pose_checkpoint, device=args.device) 
+ init_default_scope(model.cfg.get('default_scope', 'mmpose')) + + folders = [f'{args.root}/{folder}' for folder in os.listdir(args.root)] + + pose_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + # inference a single image + lookup = {} + L = len(folders) + t = time.time() + for idx, folder in enumerate(folders): + results = inference_topdown(model, pose_pipeline, det_model, + det_pipeline, folder) + lookup.update(results) + if idx % 100 == 99: + eta = (time.time() - t) / (idx + 1) * (L - idx) / 3600 + print('Require %.2f hours' % eta) + + with open('jester.pkl', 'wb') as f: + pickle.dump(lookup, f) + + +if __name__ == '__main__': + main() diff --git a/projects/gesture_recognition/parse_pose.py b/projects/gesture_recognition/parse_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..161be9261fe8f3f4e11beb49e0a9b764c3e18508 --- /dev/null +++ b/projects/gesture_recognition/parse_pose.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import json +import os + +import numpy as np + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert and merge hand pose dataset to COCO style') + parser.add_argument( + '--data_root', + type=str, + default='./data/', + help='the root to all involved datasets') + parser.add_argument( + '--out_anno_prefix', + type=str, + default='hand_det', + help='the prefix of output annotation files') + + args = parser.parse_args() + return args + + +def get_data_root(path): + path = path.split('/') + index = path.index('annotations') - 1 + root = path[index] + if root == 'halpe': + root = 'halpe/hico_20160224_det/images/train2015/' + return root + + +def parse_coco_style(file_path, anno_idx=0): + with open(file_path) as f: + contents = json.load(f) + + data_root = get_data_root(file_path) + '/' + images = contents['images'] + annos = contents['annotations'] + images_out, annos_out = [], [] + for img, anno in zip(images, annos): + assert img['id'] == anno['image_id'] + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=anno['area'], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=anno['bbox'], + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + return images_out, annos_out, anno_idx + + +def parse_halpe(file_path, anno_idx): + + def get_bbox(keypoints): + """Get bbox from keypoints.""" + if len(keypoints) == 0: + return [0, 0, 0, 0] + x1, y1, _ = np.amin(keypoints, axis=0) + x2, y2, _ = np.amax(keypoints, axis=0) + w, h = x2 - x1, y2 - y1 + return [x1, y1, w, h] + + with open(file_path) as f: + contents = json.load(f) + + data_root = get_data_root(file_path) + '/' + images = contents['images'] + annos = contents['annotations'] + images_out, annos_out = [], [] + for img, anno in zip(images, annos): + assert img['id'] == anno['image_id'] + keypoints = 
np.array(anno['keypoints']).reshape(-1, 3) + lefthand_kpts = keypoints[-42:-21, :] + righthand_kpts = keypoints[-21:, :] + + left_mask = lefthand_kpts[:, 2] > 0 + right_mask = righthand_kpts[:, 2] > 0 + lefthand_box = get_bbox(lefthand_kpts[left_mask]) + righthand_box = get_bbox(righthand_kpts[right_mask]) + + if max(lefthand_box) > 0: + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=lefthand_box[2] * lefthand_box[3], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=lefthand_box, + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + + if max(righthand_box) > 0: + img_out = dict( + file_name=data_root + img['file_name'], + height=img['height'], + width=img['width'], + id=anno_idx) + anno_out = dict( + area=righthand_box[2] * righthand_box[3], + iscrowd=anno['iscrowd'], + image_id=anno_idx, + bbox=righthand_box, + category_id=0, + id=anno_idx) + anno_idx += 1 + images_out.append(img_out) + annos_out.append(anno_out) + return images_out, annos_out, anno_idx + + +train_files = [ + 'freihand/annotations/freihand_train.json', + 'halpe/annotations/halpe_train_v1.json', + 'onehand10k/annotations/onehand10k_train.json', + '/rhd/annotations/rhd_train.json' +] + +val_files = ['onehand10k/annotations/onehand10k_test.json'] + + +def convert2dict(data_root, anno_files): + anno_files = [data_root + _ for _ in anno_files] + + images, annos, anno_idx = [], [], 0 + for anno_file in anno_files: + if 'freihand' in anno_file or 'onehand10k' in anno_file \ + or 'rhd' in anno_file: + images_out, annos_out, anno_idx = parse_coco_style( + anno_file, anno_idx) + images += images_out + annos += annos_out + elif 'halpe' in anno_file: + images_out, annos_out, anno_idx = parse_halpe(anno_file, anno_idx) + images += images_out + annos += annos_out + else: + print(f'{anno_file} not supported') + + result = dict( + images=images, + 
annotations=annos, + categories=[{ + 'id': 0, + 'name': 'hand' + }]) + return result + + +if __name__ == '__main__': + args = parse_args() + data_root = args.data_root + '/' + prefix = args.out_anno_prefix + os.makedirs('hand_det', exist_ok=True) + + result = convert2dict(data_root, train_files) + with open(f'hand_det/{prefix}_train.json', 'w') as f: + json.dump(result, f) + + result = convert2dict(data_root, val_files) + with open(f'hand_det/{prefix}_val.json', 'w') as f: + json.dump(result, f) diff --git a/projects/knowledge_distillation/README.md b/projects/knowledge_distillation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be34cfd80432a5239ccf16a951d77de7ee491ad3 --- /dev/null +++ b/projects/knowledge_distillation/README.md @@ -0,0 +1,132 @@ +# Knowledge Distillation Based on MMRazor + +Knowledge Distillation is a classic model compression method. The core idea is to "imitate" a teacher model (or multi-model ensemble) with better performance and more complex structure by guiding a lightweight student model, improving the performance of the student model without changing its structure. [MMRazor](https://github.com/open-mmlab/mmrazor) is a model compression toolkit for model slimming and AutoML, which supports several KD algorithms. In this project, we take TSM-MobileNetV2 as an example to show how to use MMRazor to perform knowledge distillation on action recognition models. You could refer to more [MMRazor](https://github.com/open-mmlab/mmrazor) for more model compression algorithms. + +## Description + +This is an implementation of MMRazor Knowledge Distillation Application, we provide action recognition configs and models for MMRazor. + +## Usage + +### Prerequisites + +- [MMRazor v1.0.0](https://github.com/open-mmlab/mmrazor/tree/v1.0.0) or higher + +There are two install modes: + +Option (a). Install as a Python package + +```shell +mim install "mmrazor>=1.0.0" +``` + +Option (b). 
Install from source
+
+```shell
+git clone https://github.com/open-mmlab/mmrazor.git
+cd mmrazor
+pip install -v -e .
+```
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+
+At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time after you opened a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md).
+
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher slurm \
+    --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+Please convert the knowledge distillation checkpoint to a student-only checkpoint with the following commands; you will get a checkpoint with a '\_student.pth' suffix under the same directory as the original checkpoint. Then take the student-only checkpoint for testing.
+ +```bash +mim run mmrazor convert_kd_ckpt_to_student $CHECKPOINT +``` + +**To test with single GPU:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results and models + +| Location | Dataset | Teacher | Student | Acc | Acc(T) | Acc(S) | Config | Download | +| :------: | :----------: | :------------: | :---------------: | :---------: | :----: | :----: | :-------------------: | :---------------------------------------------------------------------------- | +| logits | Kinetics-400 | [TSM-ResNet50] | [TSM-MobileNetV2] | 69.60(+0.9) | 73.22 | 68.71 | [config][distill_tsm] | [teacher][tsm_r50_pth] \| [model][distill_pth_tsm] \| [log][distill_log_tsm] | +| logits | Kinetics-400 | [TSN-Swin] | [TSN-ResNet50] | 75.54(+1.4) | 79.22 | 74.12 | [config][distill_tsn] | [teacher][tsn_swin_pth] \| [model][distill_pth_tsn] \| [log][distill_log_tsn] | + +## Citation + +```latex +@article{huang2022knowledge, + title={Knowledge Distillation from A Stronger Teacher}, + author={Huang, Tao and You, Shan and Wang, Fei and Qian, Chen and Xu, Chang}, + journal={arXiv preprint arXiv:2205.10536}, + year={2022} +} +``` + +[distill_log_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.log +[distill_log_tsn]: 
https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400.log +[distill_pth_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400_20230517-c3e8aa0d.pth +[distill_pth_tsn]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400_student_20230530-f938d404.pth +[distill_tsm]: configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py +[distill_tsn]: configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py +[tsm-mobilenetv2]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py +[tsm-resnet50]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py +[tsm_r50_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb_20220831-a6db1e5d.pth +[tsn-resnet50]: ../../configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py +[tsn-swin]: ../../configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py +[tsn_swin_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth diff --git a/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py new file mode 100644 index 0000000000000000000000000000000000000000..3232c4bb5e6bbb064d19bb37e8fbad7ee56843a1 --- /dev/null +++ 
b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py @@ -0,0 +1,36 @@ +_base_ = 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + +teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth' # noqa: E501 +model = dict( + _delete_=True, + _scope_='mmrazor', + type='SingleTeacherDistill', + architecture=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + student_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + teacher_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + distill_losses=dict( + loss_dist=dict( + type='DISTLoss', + inter_loss_weight=1.0, + intra_loss_weight=1.0, + tau=1, + loss_weight=1, + )), + loss_forward_mappings=dict( + loss_dist=dict( + logits_S=dict(from_student=True, recorder='logits'), + logits_T=dict(from_student=False, recorder='logits'))))) + +val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop') diff --git a/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py new file mode 100644 index 0000000000000000000000000000000000000000..924c1f84a98af8e26deabb13160cfaf30b06c6a8 --- /dev/null +++ b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py @@ -0,0 +1,38 @@ +_base_ = 
'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' # noqa: E501 + +teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth' # noqa: E501 + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='SingleTeacherDistill', + architecture=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py', # noqa: E501 + backbone=dict(pretrained=False), + pretrained=False), + teacher=dict( + cfg_path= # noqa: E251 + 'mmaction::recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py', # noqa: E501 + pretrained=False), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + student_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + teacher_recorders=dict( + logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')), + distill_losses=dict( + loss_dist=dict( + type='DISTLoss', + inter_loss_weight=1.0, + intra_loss_weight=1.0, + tau=1, + loss_weight=4, + )), + loss_forward_mappings=dict( + loss_dist=dict( + logits_S=dict(from_student=True, recorder='logits'), + logits_T=dict(from_student=False, recorder='logits'))))) + +val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop') diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56b9b08b1ff971ab9f46e630e26568631ba3427d --- /dev/null +++ b/projects/msg3d/README.md @@ -0,0 +1,117 @@ +# MSG3D Project + +[Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition](https://arxiv.org/abs/2003.14111) + + + +## Abstract + + + +Spatial-temporal graphs have been widely used by skeleton-based action recognition algorithms 
to model human action dynamics. To capture robust movement patterns from these graphs, long-range and multi-scale context aggregation and spatial-temporal dependency modeling are critical aspects of a powerful feature extractor. However, existing methods have limitations in achieving (1) unbiased long-range joint relationship modeling under multi-scale operators and (2) unobstructed cross-spacetime information flow for capturing complex spatial-temporal dependencies. In this work, we present (1) a simple method to disentangle multi-scale graph convolutions and (2) a unified spatial-temporal graph convolutional operator named G3D. The proposed multi-scale aggregation scheme disentangles the importance of nodes in different neighborhoods for effective long-range modeling. The proposed G3D module leverages dense cross-spacetime edges as skip connections for direct information propagation across the spatial-temporal graph. By coupling these proposals, we develop a powerful feature extractor named MS-G3D based on which our model outperforms previous state-of-the-art methods on three large-scale datasets: NTU RGB+D 60, NTU RGB+D 120, and Kinetics Skeleton 400. + + + +
+ +
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +Assume that you are located at `$MMACTION2/projects/msg3d`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). 
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 92.3 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230309-73b97296.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + 
+### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 89.6 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-c325d222.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{liu2020disentangling, + title={Disentangling and unifying graph convolutions for skeleton-based action recognition}, + author={Liu, Ziyu and Zhang, Hongwen and Chen, Zhenghao and Wang, Zhiyong and Ouyang, Wanli}, + booktitle={CVPR}, + pages={143--152}, + year={2020} +} +``` diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa483e0e9dcc3e2b62a9fa8c2ce5d9e35e26bd5 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='coco', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + 
dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] 
+ +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..07a135edf36e1f6cc2ceb6d85db568eb106e75a5 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='nturgb+d', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + 
dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/models/__init__.py b/projects/msg3d/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f37df2570baec9615a97e89a31d75a9e299f389e --- /dev/null +++ b/projects/msg3d/models/__init__.py @@ -0,0 +1,3 @@ +from .msg3d import MSG3D + +__all__ = ['MSG3D'] diff --git a/projects/msg3d/models/msg3d.py b/projects/msg3d/models/msg3d.py new file mode 100644 index 0000000000000000000000000000000000000000..421529378a9b6ac0a86b4171daf3422239f45f90 --- /dev/null +++ b/projects/msg3d/models/msg3d.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, Sequential + +from mmaction.models.utils import Graph +from mmaction.registry import MODELS +from .msg3d_utils import MSGCN, MSTCN, MW_MSG3DBlock + + +@MODELS.register_module() +class MSG3D(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=96, + num_gcn_scales=13, + num_g3d_scales=6, + num_person=2, + tcn_dropout=0): + super().__init__() + + self.graph = Graph(**graph_cfg) + # Note that A is a 2D tensor + A = torch.tensor( + self.graph.A[0], dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + self.num_point = A.shape[-1] + self.in_channels = in_channels + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(self.num_point * in_channels * + num_person) + c1, c2, c3 = base_channels, base_channels * 2, base_channels * 4 + + # r=3 STGC blocks + self.gcn3d1 = MW_MSG3DBlock(3, c1, A, num_g3d_scales, window_stride=1) + self.sgcn1 = Sequential( + MSGCN(num_gcn_scales, 3, c1, A), MSTCN(c1, c1), MSTCN(c1, c1)) + self.sgcn1[-1].act = nn.Identity() + self.tcn1 = MSTCN(c1, c1, tcn_dropout=tcn_dropout) + + self.gcn3d2 = MW_MSG3DBlock(c1, c2, A, num_g3d_scales, window_stride=2) + self.sgcn2 = Sequential( + MSGCN(num_gcn_scales, c1, c1, A), MSTCN(c1, c2, stride=2), + MSTCN(c2, c2)) + 
self.sgcn2[-1].act = nn.Identity() + self.tcn2 = MSTCN(c2, c2, tcn_dropout=tcn_dropout) + + self.gcn3d3 = MW_MSG3DBlock(c2, c3, A, num_g3d_scales, window_stride=2) + self.sgcn3 = Sequential( + MSGCN(num_gcn_scales, c2, c2, A), MSTCN(c2, c3, stride=2), + MSTCN(c3, c3)) + self.sgcn3[-1].act = nn.Identity() + self.tcn3 = MSTCN(c3, c3, tcn_dropout=tcn_dropout) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous().reshape(N, M * V * C, T) + x = self.data_bn(x) + x = x.reshape(N * M, V, C, T).permute(0, 2, 3, 1).contiguous() + + # Apply activation to the sum of the pathways + x = F.relu(self.sgcn1(x) + self.gcn3d1(x), inplace=True) + x = self.tcn1(x) + + x = F.relu(self.sgcn2(x) + self.gcn3d2(x), inplace=True) + x = self.tcn2(x) + + x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True) + x = self.tcn3(x) + + # N * M, C, T, V + return x.reshape((N, M) + x.shape[1:]) diff --git a/projects/msg3d/models/msg3d_utils.py b/projects/msg3d/models/msg3d_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c9aac57ad612fae3ea01578cff18bf554aa48f9d --- /dev/null +++ b/projects/msg3d/models/msg3d_utils.py @@ -0,0 +1,342 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn +from mmaction.models.utils.graph import k_adjacency, normalize_digraph + + +class MLP(BaseModule): + + def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='ReLU'), + dropout=0): + super().__init__() + channels = [in_channels] + out_channels + self.layers = ModuleList() + for i in range(1, len(channels)): + if dropout > 1e-3: + self.layers.append(nn.Dropout(p=dropout)) + self.layers.append( + nn.Conv2d(channels[i - 1], channels[i], kernel_size=1)) + self.layers.append(nn.BatchNorm2d(channels[i])) + if act_cfg: + self.layers.append(build_activation_layer(act_cfg)) + + def 
def forward(self, x):
    """Multi-scale spatial graph convolution.

    Args:
        x (torch.Tensor): Input of shape ``(N, C, T, V)``.

    Returns:
        torch.Tensor: Output of the MLP projection, channel count set by
        the MLP's ``out_channels``.
    """
    n, c, t, v = x.shape
    # The learned perturbation ``PA`` refines the fixed multi-scale
    # adjacency stack ``A`` (shape: K, V, V).
    adj = self.A + self.PA
    # Aggregate neighbours at every scale k: (N, K, C, T, V).
    feat = torch.einsum('kvu,nctv->nkctu', adj, x)
    # Stack the K scales along the channel axis and project with the MLP.
    return self.mlp(feat.reshape(n, self.num_scales * c, t, v))
def forward(self, x):
    """Multi-branch temporal convolution with a residual shortcut.

    Args:
        x (torch.Tensor): Input of shape ``(N, C, T, V)``.

    Returns:
        torch.Tensor: Activated, dropout-regularized sum of the branch
        outputs and the residual path.
    """
    shortcut = self.residual(x)
    # Run every temporal branch on the same input and stack the results
    # along the channel axis.
    out = torch.cat([branch(x) for branch in self.branches], dim=1)
    # ``shortcut`` is the int 0 when residual=False was configured.
    out += shortcut
    return self.drop(self.act(out))
def forward(self, x):
    """Multi-scale graph convolution over the space-time window graph.

    Args:
        x (torch.Tensor): Input of shape ``(N, C, T, V)`` where T is the
            number of windows and V equals ``self.V * window_size``.

    Returns:
        torch.Tensor: Activated output of the MLP projection plus the
        residual path.
    """
    N, C, T, V = x.shape
    adj = self.A + self.PA

    res = self.residual(x)
    # Aggregate neighbours at every scale k over the window graph,
    # then stack the scales along the channel axis: (N, K*C, T, V).
    agg = torch.einsum('kvu,nctv->nkctu', adj, x)
    agg = agg.reshape(N, self.num_scales * C, T, V)
    out = self.mlp(agg)
    # BUGFIX: the previous ``if res == 0:`` raised "Boolean value of
    # Tensor ... is ambiguous" whenever ``self.residual`` returned a
    # tensor (residual=True paths) — tensor equality is element-wise.
    # ``res`` is the int 0 when no residual path is configured, and
    # ``out + 0`` is a no-op, so unconditional addition is equivalent.
    return self.act(out + res)
self.out_channels = out_channels + self.embed_channels_in = out_channels // embed_factor + self.embed_channels_out = out_channels // embed_factor + if embed_factor == 1: + self.in1x1 = nn.Identity() + self.embed_channels_in = self.embed_channels_out = in_channels + # The first STGC block changes channels right away; + # others change at collapse + if in_channels == 3: + self.embed_channels_out = out_channels + else: + self.in1x1 = MLP(in_channels, [self.embed_channels_in]) + + self.gcn3d = Sequential( + UnfoldTemporalWindows(window_size, window_stride, window_dilation), + ST_MSGCN( + in_channels=self.embed_channels_in, + out_channels=self.embed_channels_out, + A=A, + num_scales=num_scales, + window_size=window_size)) + + self.out_conv = nn.Conv3d( + self.embed_channels_out, + out_channels, + kernel_size=(1, self.window_size, 1)) + self.out_bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + N, _, T, V = x.shape + x = self.in1x1(x) + # Construct temporal windows and apply MS-GCN + x = self.gcn3d(x) + + # Collapse the window dimension + x = x.reshape(N, self.embed_channels_out, -1, self.window_size, V) + x = self.out_conv(x).squeeze(dim=3) + x = self.out_bn(x) + # no activation + return x + + +class MW_MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_sizes=[3, 5], + window_stride=1, + window_dilations=[1, 1]): + + super().__init__() + self.gcn3d = ModuleList([ + MSG3DBlock(in_channels, out_channels, A, num_scales, window_size, + window_stride, window_dilation) for window_size, + window_dilation in zip(window_sizes, window_dilations) + ]) + + def forward(self, x): + out_sum = 0 + for gcn3d in self.gcn3d: + out_sum += gcn3d(x) + return out_sum diff --git a/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 
# Copyright (c) OpenMMLab. All rights reserved.
# Flattened (dumped) MMDetection config: Faster R-CNN with a caffe-style
# ResNet-50 + FPN backbone, multi-scale 1x training schedule on COCO.
# All values are inlined from the base configs rather than inherited.

# ---- model -----------------------------------------------------------------
model = dict(
    type='FasterRCNN',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        # Caffe-style preprocessing: BGR order, mean-only normalization.
        mean=[103.53, 116.28, 123.675],
        std=[1.0, 1.0, 1.0],
        bgr_to_rgb=False,
        pad_size_divisor=32),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='caffe',
        init_cfg=dict(
            type='Pretrained',
            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # Training hyper-parameters for RPN / proposal generation / R-CNN head.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))

# ---- dataset & pipelines ---------------------------------------------------
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
backend_args = None
# Multi-scale training: short side randomly chosen from 640-800.
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='RandomChoiceResize',
        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
                (1333, 768), (1333, 800)],
        keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackDetInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]
# NOTE: the dumped config repeats the pipelines inline in each dataloader
# instead of referencing train_pipeline/test_pipeline above.
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    batch_sampler=dict(type='AspectRatioBatchSampler'),
    dataset=dict(
        type='CocoDataset',
        data_root='data/coco/',
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=True, min_size=32),
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=None),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(
                type='RandomChoiceResize',
                scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
                        (1333, 768), (1333, 800)],
                keep_ratio=True),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PackDetInputs')
        ],
        backend_args=None))
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='data/coco/',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        test_mode=True,
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=None),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(
                type='PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor'))
        ],
        backend_args=None))
# Testing reuses the val2017 split and pipeline.
test_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='data/coco/',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        test_mode=True,
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=None),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(
                type='PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor'))
        ],
        backend_args=None))

# ---- evaluation ------------------------------------------------------------
val_evaluator = dict(
    type='CocoMetric',
    ann_file='data/coco/annotations/instances_val2017.json',
    metric='bbox',
    format_only=False,
    backend_args=None)
test_evaluator = dict(
    type='CocoMetric',
    ann_file='data/coco/annotations/instances_val2017.json',
    metric='bbox',
    format_only=False,
    backend_args=None)

# ---- schedule (1x = 12 epochs, lr steps at 8 and 11) -----------------------
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
param_scheduler = [
    dict(
        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
    dict(
        type='MultiStepLR',
        begin=0,
        end=12,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
auto_scale_lr = dict(enable=False, base_batch_size=16)

# ---- runtime defaults ------------------------------------------------------
default_scope = 'mmdet'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=1),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer',
    vis_backends=[dict(type='LocalVisBackend')],
    name='visualizer')
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
log_level = 'INFO'
load_from = None
resume = False
# Copyright (c) OpenMMLab. All rights reserved.
# Fine-tune the COCO-pretrained Faster R-CNN as a single-class (person)
# detector on MultiSports frames.
_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))

# take 2 epochs as an example
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)

# learning rate: constant LR during the first 500 iterations
param_scheduler = [
    dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)
]

# optimizer
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))

dataset_type = 'CocoDataset'
# modify metainfo: single 'person' class with its display color
metainfo = {
    'classes': ('person', ),
    'palette': [
        (220, 20, 60),
    ]
}

# specify metainfo, dataset path
data_root = 'data/multisports/'

train_dataloader = dict(
    dataset=dict(
        data_root=data_root,
        ann_file='annotations/multisports_det_anno_train.json',
        data_prefix=dict(img='rawframes/'),
        metainfo=metainfo))

val_dataloader = dict(
    dataset=dict(
        data_root=data_root,
        ann_file='annotations/multisports_det_anno_val.json',
        data_prefix=dict(img='rawframes/'),
        metainfo=metainfo))

test_dataloader = dict(
    dataset=dict(
        data_root=data_root,
        ann_file='annotations/ms_infer_anno.json',
        data_prefix=dict(img='rawframes/'),
        metainfo=metainfo))

# specify annotation file path, modify metric items
val_evaluator = dict(
    ann_file='data/multisports/annotations/multisports_det_anno_val.json',
    metric_items=['mAP_50', 'AR@100'],
    iou_thrs=[0.5],
)

test_evaluator = dict(
    ann_file='data/multisports/annotations/ms_infer_anno.json',
    metric_items=['mAP_50', 'AR@100'],
    iou_thrs=[0.5],
)

# specify pretrain checkpoint (COCO person-only Faster R-CNN)
load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'  # noqa: E501
# Copyright (c) OpenMMLab. All rights reserved.
# SlowOnly spatio-temporal action-detection config for MultiSports; the
# `mmaction::` prefix resolves the base config inside the installed package.
_base_ = [
    'mmaction::detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py'  # noqa: E501
]

# Human-proposal files produced by the detection stage of this tutorial.
proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'  # noqa: E501
proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'  # noqa: E501

train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    dataset=dict(proposal_file=proposal_file_train))

val_dataloader = dict(
    num_workers=2, dataset=dict(proposal_file=proposal_file_val))

# NOTE(review): lr is lowered relative to the 8-GPU base config —
# presumably to match the smaller batch size here; confirm.
optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01))

# Initialize from the MultiSports-finetuned SlowOnly checkpoint.
load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'  # noqa: E501
In this tutorial, we will use the MultiSports dataset as an example, and provide a complete step-by-step guide for spatio-temporal action detection, including\n", + "- Prepare spatio-temporal action detection dataset\n", + "- Train detection model\n", + "- Prepare AVA format dataset\n", + "- Train spatio-temporal action detection model\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "xQlffdn7ooZq" + }, + "source": [ + "## 0. Install MMAction2 and MMDetection" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4vWjBJI-ooZr", + "outputId": "1c852c24-eb40-407d-e1c4-72d4b43385a3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in 
/usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + "Installing collected packages: ordered-set, colorama, 
model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting yapf (from mmengine)\n", + " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n", + " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n", + "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n", + " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n", + " Attempting uninstall: platformdirs\n", 
+ " Found existing installation: platformdirs 3.3.0\n", + " Uninstalling platformdirs-3.3.0:\n", + " Successfully uninstalled platformdirs-3.3.0\n", + "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n", + "Requirement already satisfied: termcolor in 
/usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n", + "Requirement already satisfied: 
six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmdet\n", + " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n", + "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n", + "Collecting terminaltables (from mmdet)\n", + " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n", + "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n", + "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n", + "Requirement already satisfied: 
Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from 
yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n", + "Installing collected packages: terminaltables, mmdet\n", + "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 22869, done.\u001b[K\n", + "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n", + "remote: Compressing objects: 100% (800/800), done.\u001b[K\n", + "remote: Total 22869 (delta 855), reused 1176 (delta 686), pack-reused 21378\u001b[K\n", + "Receiving objects: 100% (22869/22869), 82.81 MiB | 15.42 MiB/s, done.\n", + "Resolving deltas: 100% (15954/15954), done.\n", + "/content/mmaction2\n", + "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Running command python setup.py egg_info\n", + " running egg_info\n", + " creating /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info\n", + " writing /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/requires.txt\n", + " writing top-level names to 
/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/top_level.txt\n", + " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " warning: no files found matching 'mmaction/.mim/model-index.yml'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n", + " adding license file 'LICENSE'\n", + " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m71.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from 
mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: 
packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + " Running command python setup.py develop\n", + " running develop\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` and ``easy_install``.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://github.com/pypa/setuptools/issues/917 for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " easy_install.initialize_options(self)\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` directly.\n", + " Instead, 
use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " self.initialize_options()\n", + " running egg_info\n", + " creating mmaction2.egg-info\n", + " writing mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to mmaction2.egg-info/requires.txt\n", + " writing top-level names to mmaction2.egg-info/top_level.txt\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " adding license file 'LICENSE'\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " running build_ext\n", + " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n", + " Adding mmaction2 1.0.0 to easy-install.pth file\n", + "\n", + " Installed /content/mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "/content/mmaction2/projects/stad_tutorial\n" + ] + } + ], + "source": [ + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install mmcv\n", + "!mim install mmdet\n", + "\n", + "!git clone https://github.com/open-mmlab/mmaction2.git\n", + "\n", + "%cd mmaction2\n", + "%pip install -v -e .\n", + "%cd projects/stad_tutorial" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ox0TM64FooZt" + }, + "source": [ + "## 1. Prepare spatio-temporal action detection dataset\n", + "\n", + "Similar to detection tasks that require bounding box annotations, spatio-temporal action detection tasks require temporal and spatial localization, so more complex tube annotations are required. 
Taking the MultiSports dataset as an example, the `gttubes` field provides all the target action annotations in the video, and the following is an annotation fragment:\n", + "\n", + "```\n", + " 'gttubes': {\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n", + " {\n", + " 10: # label index\n", + " [\n", + " array([[ 377., 904., 316., 1016., 584.], # 1st tube of class 10\n", + " [ 378., 882., 315., 1016., 579.], # shape (n, 5): n frames,each annotation includes (frame idx,x1,y1, x2, y2)\n", + " ...\n", + " [ 398., 861., 304., 954., 549.]], dtype=float32),\n", + "\n", + " array([[ 399., 881., 308., 955., 542.], # 2nd tube of class 10\n", + " [ 400., 862., 303., 988., 539.],\n", + " [ 401., 853., 292., 1000., 535.],\n", + " ...])\n", + " ...\n", + "\n", + " ] ,\n", + " 9: # label index\n", + " [\n", + " array(...), # 1st tube of class 9\n", + " array(...), # 2nd tube of class 9\n", + " ...\n", + " ]\n", + " ...\n", + " }\n", + " }\n", + "```\n", + "\n", + "The annotation file also needs to provide other field information, and the complete ground truth file includes the following information:\n", + "\n", + "```\n", + "{\n", + " 'labels': # label list\n", + " ['aerobic push up', 'aerobic explosive push up', ...],\n", + " 'train_videos': # training video list\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n", + " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n", + " ...\n", + " ]\n", + " ]\n", + " 'test_videos': # test video list\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n", + " ...\n", + " ]\n", + " ]\n", + " 'n_frames': # dict provides frame number of each video\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n", + " ...\n", + " }\n", + " 'resolution': # dict provides resolution of each video\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n", + " 
'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n", +        "        ...\n", +        "    }\n", +        "    'gttubes':    # dict provides bounding boxes of each tube\n", +        "    {\n", +        "        ...   # refer to above description\n", +        "    }\n", +        "}\n", +        "```\n", +        "\n", +        "The subsequent experiments are based on MultiSports-tiny; we extracted a small number of videos from MultiSports for demonstration purposes." +   ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n5AzsRvdooZv", + "outputId": "a6cad83b-4613-43cc-8c09-86ac79242656" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-06-15 06:00:15--  https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.215, 163.181.82.216, 163.181.82.218, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.215|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 82780160 (79M) [application/x-tar]\n", + "Saving to: ‘data/multisports-tiny.tar’\n", + "\n", + "multisports-tiny.ta 100%[===================>] 78.95M 13.3MB/s in 44s \n", + "\n", + "2023-06-15 06:01:00 (1.78 MB/s) - ‘data/multisports-tiny.tar’ saved [82780160/82780160]\n", + "\n", + "multisports-tiny/multisports/\n", + "multisports-tiny/multisports/test/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n", + "multisports-tiny/multisports/annotations/\n", + "multisports-tiny/multisports/annotations/multisports_GT.pkl\n", + "multisports-tiny/multisports/trainval/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 1s (43.0 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 
122541 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001b[01;34mdata\u001b[00m\n", + "├── \u001b[01;34mmultisports\u001b[00m\n", + "│   ├── \u001b[01;34mannotations\u001b[00m\n", + "│   │   └── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "│   ├── \u001b[01;34mtest\u001b[00m\n", + "│   │   └── \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "│   │   └── \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n", + "│   └── \u001b[01;34mtrainval\u001b[00m\n", + "│   └── \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "│   ├── \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n", + "│   ├── \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n", + "│   └── \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n", + "└── \u001b[01;31mmultisports-tiny.tar\u001b[00m\n", + "\n", + "6 directories, 6 files\n" + ] + } + ], + "source": [ + "# Download dataset\n", + "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n", + "!apt-get -q install tree\n", + "!tree data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_u69LHscooZw" + }, + "source": [ + "## 2. Train detection model\n", + "\n", + "In the SlowOnly + Det paradigm, we need to train a human detector first, and then predict actions based on the detection results. In this section, we train a detection model based on the annotation format in the previous section and the MMDetection algorithm library.\n", + "\n", + "### 2.1 Build detection dataset annotation (COCO format)\n", + "\n", + "Based on the annotation information of the spatio-temporal action detection dataset, we can build a COCO format detection dataset for training the detection model. 
We provide a script to convert the MultiSports format annotation, if you need to convert from other formats, you can refer to the [custom dataset](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html) document provided by MMDetection." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e8fu9VtRooZw", + "outputId": "3e7a7053-a08d-4c32-9d66-a362b3de164d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "├── multisports_det_anno_train.json\n", + "├── multisports_det_anno_val.json\n", + "└── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "\n", + "0 directories, 3 files\n" + ] + } + ], + "source": [ + "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n", + "!tree data/multisports/annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HJAb8EwwooZx", + "outputId": "1c82387c-c731-484c-a4cc-8c255b3f2e62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will generate 3 rgb dir for aerobic_gymnastics.\n", + "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n" + ] + } + ], + "source": [ + "!python tools/generate_rgb.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9xIOk_XkooZx" + }, + "source": [ + "### 2.2 Modify config file\n", + "\n", + "We use faster-rcnn_x101-64x4d_fpn_1x_coco as the base configuration, and make the following modifications to train on the MultiSports dataset. 
The following parts need to be modified:\n", + "- Number of model categories\n", + "- Learning rate adjustment strategy\n", + "- Optimizer configuration\n", + "- Dataset/annotation file path\n", + "- Evaluator configuration\n", + "- Pre-trained model\n", + "\n", + "For more detailed tutorials, please refer to the [prepare configuration file](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9) document provided by MMDetection." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ad1QLNM8ooZy", + "outputId": "55f95e91-8fdf-40fa-dd08-5fa980444b6f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Copyright (c) OpenMMLab. All rights reserved.\n", + "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n", + "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n", + "\n", + "# take 2 epochs as an example\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "\n", + "# learning rate\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "\n", + "# optimizer\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n", + "\n", + "dataset_type = 'CocoDataset'\n", + "# modify metainfo\n", + "metainfo = {\n", + " 'classes': ('person', ),\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "\n", + "# specify metainfo, dataset path\n", + "data_root = 'data/multisports/'\n", + "\n", + "train_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " 
ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "test_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "# specify annotaition file path, modify metric items\n", + "val_evaluator = dict(\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "test_evaluator = dict(\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "# specify pretrain checkpoint\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n" + ] + } + ], + "source": [ + "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "W40JO80nooZ0" + }, + "source": [ + "### 2.3 Train detection model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Oc1LWr4AooZ0" + }, + "source": [ + "By using MIM, you can directly train MMDetection models in the current directory. Here is the simplest example of training on a single GPU. For more training commands, please refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QpxCbvr2ooZ0", + "outputId": "ffe7b420-c359-4e5a-a1b1-3a75e923046d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. \n", + "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 503128501\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + 
"Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 503128501\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", 
+ " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', 
with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " 
data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = 
dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/det_model'\n", + "\n", + "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + 
" -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + 
"after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n", + "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n", + "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n", + "100% 89.9M/89.9M [00:02<00:00, 34.8MB/s]\n", + "06/15 06:02:21 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: conv1.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "100% 158M/158M [00:04<00:00, 37.4MB/s]\n", + "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - 
\"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n", + "06/15 06:02:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:01:56 time: 0.6273 data_time: 0.0111 memory: 3414 loss: 0.5456 loss_rpn_cls: 0.0070 loss_rpn_bbox: 0.0167 loss_cls: 0.1887 acc: 93.2617 loss_bbox: 0.3332\n", + "06/15 06:03:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:16 time: 0.5041 data_time: 0.0078 memory: 3414 loss: 0.4017 loss_rpn_cls: 0.0027 loss_rpn_bbox: 0.0130 loss_cls: 0.1313 acc: 94.8242 loss_bbox: 0.2547\n", + "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n", + "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:03:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1196 data_time: 0.0059 memory: 3414 \n", + "06/15 06:03:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1234 data_time: 0.0082 memory: 679 \n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.05s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | 
maxDets=100 ] = 0.872\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.709\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.886\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.964\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.963\n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.872 -1.000 -1.000 -1.000 0.709 0.886\n", + "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9640 data_time: 0.0067 time: 0.1212\n", + "06/15 06:04:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5316 data_time: 0.0094 memory: 3414 loss: 0.3385 loss_rpn_cls: 0.0012 loss_rpn_bbox: 0.0111 loss_cls: 0.1119 acc: 95.4102 loss_bbox: 0.2143\n", + "06/15 06:04:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5152 data_time: 0.0078 memory: 3414 loss: 0.3152 loss_rpn_cls: 0.0017 loss_rpn_bbox: 0.0109 loss_cls: 0.1050 acc: 94.7266 loss_bbox: 0.1977\n", + "06/15 06:04:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n", + "06/15 06:04:49 - mmengine - 
\u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:04:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0080 memory: 3414 \n", + "06/15 06:05:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1202 data_time: 0.0062 memory: 679 \n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.907\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.762\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.910\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.907 -1.000 -1.000 -1.000 0.762 0.910\n", + "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 
coco/bbox_AR@100: 0.9600 data_time: 0.0066 time: 0.1214\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --work-dir work_dirs/det_model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "IxlO927KooZ1" + }, + "source": [ + "### 2.4 Generating Proposal BBoxes\n", + "\n", + "During the training of the spatiotemporal action detection model, we need to rely on proposals generated by the detection model, rather than annotated detection boxes. Therefore, we need to use a trained detection model to perform inference on the entire dataset and convert the resulting proposals into the required format for subsequent training.\n", + "\n", + "#### 2.4.1 Converting the Dataset to Coco Format\n", + "\n", + "We provide a script to convert the MultiSports dataset into an annotation format without ground truth, which is used for inference." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e6C7D2DSooZ1", + "outputId": "878015d1-0fc7-4eb6-af77-4f61aefcf2b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[>>] 2350/2350, 2053.0 task/s, elapsed: 1s, ETA: 0s\n", + "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n" + ] + } + ], + "source": [ + "!echo 'person' > data/multisports/annotations/label_map.txt\n", + "!python tools/images2coco.py \\\n", + " data/multisports/rawframes \\\n", + " data/multisports/annotations/label_map.txt \\\n", + " ms_infer_anno.json" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "fGL3t4MEooZ1" + }, + "source": [ + "#### 2.4.2 Inference for Generating Proposal Files\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "gerYk6q6ooZ1" + }, + "source": [ + "The inference of 
MMDetection models is also based on MIM. For more testing commands, please refer to the MIM [tutorial](https://github.com/open-mmlab/mim).\n", + "\n", + "After the inference is completed, the results will be saved in 'data/multisports/annotations/ms_det_proposals.pkl'." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lutiaqzpooZ1", + "outputId": "b05db6e8-04de-4e1e-8d99-32f4c952d633" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. \n", + "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1289054678\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + 
"Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1289054678\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " 
bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " 
dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " 
data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = 
dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'work_dirs/det_model/epoch_2.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n", + "\n", + "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + 
"(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading 
annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:05:20 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "06/15 06:05:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n", + "06/15 06:05:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:50 time: 0.1523 data_time: 0.0084 memory: 512 \n", + "06/15 06:05:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:05 time: 0.1191 data_time: 0.0042 memory: 512 \n", + "06/15 06:05:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:45 time: 0.1178 data_time: 0.0023 memory: 512 \n", + "06/15 06:05:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 200/2350] eta: 0:04:36 time: 0.1255 data_time: 0.0074 memory: 512 \n", + "06/15 06:05:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:26 time: 0.1205 data_time: 0.0031 memory: 512 \n", + "06/15 06:05:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1238 data_time: 0.0063 memory: 512 \n", + "06/15 06:06:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:11 time: 0.1206 data_time: 0.0046 memory: 512 \n", + "06/15 06:06:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:03 time: 0.1178 data_time: 0.0030 memory: 512 \n", + "06/15 06:06:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:56 time: 0.1212 data_time: 0.0058 memory: 512 \n", + "06/15 06:06:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 
500/2350] eta: 0:03:48 time: 0.1165 data_time: 0.0031 memory: 512 \n", + "06/15 06:06:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:41 time: 0.1202 data_time: 0.0061 memory: 512 \n", + "06/15 06:06:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:34 time: 0.1179 data_time: 0.0044 memory: 512 \n", + "06/15 06:06:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:27 time: 0.1156 data_time: 0.0024 memory: 512 \n", + "06/15 06:06:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:21 time: 0.1212 data_time: 0.0058 memory: 512 \n", + "06/15 06:06:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:14 time: 0.1161 data_time: 0.0025 memory: 512 \n", + "06/15 06:06:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:08 time: 0.1200 data_time: 0.0058 memory: 512 \n", + "06/15 06:07:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:02 time: 0.1203 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:55 time: 0.1177 data_time: 0.0030 memory: 512 \n", + "06/15 06:07:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:50 time: 0.1233 data_time: 0.0076 memory: 512 \n", + "06/15 06:07:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:43 time: 0.1172 data_time: 0.0025 memory: 512 \n", + "06/15 06:07:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:37 time: 0.1202 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1100/2350] eta: 0:02:31 time: 0.1208 data_time: 0.0059 memory: 512 \n", + "06/15 06:07:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:25 time: 0.1167 
data_time: 0.0030 memory: 512 \n", + "06/15 06:07:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:19 time: 0.1212 data_time: 0.0053 memory: 512 \n", + "06/15 06:07:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:12 time: 0.1163 data_time: 0.0027 memory: 512 \n", + "06/15 06:07:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:06 time: 0.1188 data_time: 0.0046 memory: 512 \n", + "06/15 06:08:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:00 time: 0.1201 data_time: 0.0056 memory: 512 \n", + "06/15 06:08:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:54 time: 0.1161 data_time: 0.0024 memory: 512 \n", + "06/15 06:08:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:48 time: 0.1234 data_time: 0.0079 memory: 512 \n", + "06/15 06:08:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:42 time: 0.1165 data_time: 0.0024 memory: 512 \n", + "06/15 06:08:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:36 time: 0.1191 data_time: 0.0043 memory: 512 \n", + "06/15 06:08:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:30 time: 0.1219 data_time: 0.0071 memory: 512 \n", + "06/15 06:08:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:24 time: 0.1166 data_time: 0.0026 memory: 512 \n", + "06/15 06:08:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:18 time: 0.1224 data_time: 0.0067 memory: 512 \n", + "06/15 06:08:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:12 time: 0.1175 data_time: 0.0032 memory: 512 \n", + "06/15 06:08:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:06 time: 0.1186 data_time: 0.0041 memory: 512 \n", + 
"06/15 06:09:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:00 time: 0.1227 data_time: 0.0067 memory: 512 \n", + "06/15 06:09:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1220 data_time: 0.0070 memory: 512 \n", + "06/15 06:09:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1229 data_time: 0.0081 memory: 512 \n", + "06/15 06:09:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1173 data_time: 0.0029 memory: 512 \n", + "06/15 06:09:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1184 data_time: 0.0037 memory: 512 \n", + "06/15 06:09:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1216 data_time: 0.0066 memory: 512 \n", + "06/15 06:09:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1166 data_time: 0.0026 memory: 512 \n", + "06/15 06:09:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1213 data_time: 0.0052 memory: 512 \n", + "06/15 06:09:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1180 data_time: 0.0033 memory: 512 \n", + "06/15 06:09:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1173 data_time: 0.0032 memory: 512 \n", + "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1203 data_time: 0.0048 memory: 512 \n", + "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.36s).\n", + "Accumulating evaluation results...\n", 
+ "DONE (t=0.28s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n", + "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0047 time: 0.1202\n", + "\u001b[32mTesting finished successfully.\u001b[0m\n" + ] + } + ], + "source": [ + "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --out data/multisports/annotations/ms_det_proposals.pkl" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jzWhc7ClooZ1" + }, + "source": [ + "## 3. 
Training the Spatio-temporal Action Detection Model\n", + "The provided annotation files and the proposal files generated by MMDetection need to be converted to the required format for training the spatio-temporal action detection model. We have provided relevant scripts to generate the specified format." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W3slJsWHooZ2", + "outputId": "42a4b7be-91f8-4443-b693-ab40b743a14f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading test result...\n", + "[>>] 2350/2350, 3799.7 task/s, elapsed: 1s, ETA: 0s\n", + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "├── label_map.txt\n", + "├── ms_det_proposals.pkl\n", + "├── ms_infer_anno.json\n", + "├── multisports_det_anno_train.json\n", + "├── multisports_det_anno_val.json\n", + "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "├── multisports_proposals_train.pkl\n", + "├── multisports_proposals_val.pkl\n", + "├── multisports_train.csv\n", + "└── multisports_val.csv\n", + "\n", + "0 directories, 10 files\n" + ] + } + ], + "source": [ + "# Convert annotation files\n", + "!python ../../tools/data/multisports/parse_anno.py\n", + "\n", + "# Convert proposal files\n", + "!python tools/convert_proposals.py\n", + "\n", + "!tree data/multisports/annotations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "yRSSHmw0ooZ2" + }, + "source": [ + "### 3.2 Training the Spatio-temporal Action Detection Model\n", + "\n", + "MMAction2 already supports training on the MultiSports dataset. You just need to modify the path to the proposal file. For detailed configurations, please refer to the [config](configs/slowonly_k400_multisports.py) file. Since the training data is limited, the configuration uses a pre-trained model trained on the complete MultiSports dataset. 
When training with a custom dataset, you don't need to specify the `load_from` configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vwaay7NvooZ2", + "outputId": "add60ddd-2a40-4356-b120-1e7940043778" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n", + "06/15 06:10:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1735696538\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + 
"Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1735696538\n", + " diff_rank_seed: False\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:10:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n", + " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n", + " logger=dict(\n", + " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n", + " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook',\n", + " interval=1,\n", + " save_best='auto',\n", + " _scope_='mmaction'),\n", + " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n", + " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(\n", + " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n", + "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " _scope_='mmaction')\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n", + "resume = False\n", + "url = 
'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + "num_classes = 66\n", + "model = dict(\n", + " type='FastRCNN',\n", + " _scope_='mmdet',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + " ),\n", + " backbone=dict(\n", + " type='mmaction.ResNet3dSlowOnly',\n", + " depth=50,\n", + " pretrained=None,\n", + " pretrained2d=False,\n", + " lateral=False,\n", + " num_stages=4,\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_t=1,\n", + " pool1_stride_t=1,\n", + " spatial_strides=(1, 2, 2, 1)),\n", + " roi_head=dict(\n", + " type='AVARoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor3D',\n", + " roi_layer_type='RoIAlign',\n", + " output_size=8,\n", + " with_temporal_pool=True),\n", + " bbox_head=dict(\n", + " type='BBoxHeadAVA',\n", + " in_channels=2048,\n", + " num_classes=66,\n", + " multilabel=False,\n", + " dropout_ratio=0.5)),\n", + " data_preprocessor=dict(\n", + " type='mmaction.ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCTHW'),\n", + " train_cfg=dict(\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssignerAVA',\n", + " pos_iou_thr=0.9,\n", + " neg_iou_thr=0.9,\n", + " min_pos_iou=0.9),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=32,\n", + " pos_fraction=1,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=1.0)),\n", + " test_cfg=dict(rcnn=None))\n", + "dataset_type = 'AVADataset'\n", + "data_root = 'data/multisports/trainval'\n", + 
"anno_root = 'data/multisports/annotations'\n", + "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n", + "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n", + "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n", + "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n", + "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n", + " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n", + " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " 
ann_file='data/multisports/annotations/multisports_train.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n", + " dict(type='DecordDecode'),\n", + " dict(type='RandomRescale', scale_range=(256, 320)),\n", + " dict(type='RandomCrop', size=256),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_train.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_val.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, 
_scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "test_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop',\n", + " max_epochs=8,\n", + " val_begin=1,\n", + " val_interval=1,\n", + " _scope_='mmaction')\n", + "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n", + "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR',\n", + " start_factor=0.1,\n", + " by_epoch=True,\n", + " begin=0,\n", + " end=5,\n", + " _scope_='mmaction'),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=8,\n", + " by_epoch=True,\n", + " milestones=[6, 7],\n", + " gamma=0.1,\n", + " _scope_='mmaction')\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD',\n", + " lr=0.01,\n", + " momentum=0.9,\n", + " weight_decay=1e-05,\n", + " _scope_='mmaction'),\n", + " 
clip_grad=dict(max_norm=5, norm_type=2))\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/stad_model/'\n", + "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n", + "\n", + "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) 
ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n", + "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n", + "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "100% 124M/124M 
[00:01<00:00, 103MB/s]\n", + "06/15 06:10:28 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n", + "\n", + "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "100% 122M/122M [00:03<00:00, 36.1MB/s]\n", + "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n", + "06/15 06:10:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:06:07 time: 0.3982 data_time: 0.0431 memory: 1383 grad_norm: 13.0844 loss: 1.3834 recall@thr=0.5: 0.5385 prec@thr=0.5: 0.5385 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 1.3834\n", + "06/15 06:10:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:32 time: 0.3383 data_time: 0.0732 memory: 1383 grad_norm: 4.6786 loss: 0.6001 recall@thr=0.5: 0.9444 prec@thr=0.5: 0.9444 recall@top3: 0.9444 prec@top3: 0.3148 recall@top5: 0.9444 prec@top5: 0.1889 loss_action_cls: 0.6001\n", + "06/15 06:10:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:04:59 time: 0.2784 data_time: 0.0300 memory: 1383 grad_norm: 2.9446 loss: 0.5144 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5144\n", + "06/15 06:10:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:36 time: 0.2646 data_time: 0.0144 memory: 1383 grad_norm: 1.7695 loss: 0.4988 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.4988\n", + "06/15 06:11:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:35 time: 0.3502 data_time: 0.0839 memory: 1383 grad_norm: 2.4095 
loss: 0.3218 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3218\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:20 time: 0.2563 data_time: 0.0102 memory: 1383 grad_norm: 1.8156 loss: 0.3895 recall@thr=0.5: 0.8125 prec@thr=0.5: 0.8125 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3895\n", + "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:11:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:16 time: 0.1669 data_time: 0.1073 memory: 466 \n", + "06/15 06:11:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:13 time: 0.1698 data_time: 0.1145 memory: 466 \n", + "06/15 06:11:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:09 time: 0.1428 data_time: 0.0896 memory: 466 \n", + "06/15 06:11:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0504 memory: 466 \n", + "06/15 06:11:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1122 data_time: 0.0612 memory: 466 \n", + "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] eta: 0:00:00 time: 0.1031 data_time: 0.0528 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 
9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 
basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 59.66\n", + "aerobic split jump 30.80\n", + "aerobic scissors leap 88.34\n", + "aerobic turn 98.48\n", + "mAP 69.32\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 
34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 56.25\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", 
+ "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + 
"no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 50.00\n", + "aerobic turn 100.00\n", + "mAP 43.75\n", + "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 69.3181 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 43.7500 mAP/v_map_0.05:0.45: 55.1389 mAP/v_map_0.10:0.90: 41.2500 mAP/v_map_0.50:0.95: 28.1750 data_time: 0.0793 time: 0.1324\n", + "06/15 06:11:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 69.3181 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n", + "06/15 06:11:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:10 time: 0.2884 data_time: 0.0401 memory: 1383 grad_norm: 1.3823 loss: 0.3596 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3596\n", + "06/15 06:11:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:00 time: 0.2728 data_time: 0.0204 memory: 1383 grad_norm: 1.2185 loss: 0.5274 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5274\n", + "06/15 06:11:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:56 time: 0.3296 data_time: 0.0699 memory: 1383 grad_norm: 1.7120 loss: 0.3599 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3599\n", + "06/15 06:11:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 
80/118] lr: 3.2500e-03 eta: 0:03:46 time: 0.2584 data_time: 0.0120 memory: 1383 grad_norm: 1.7462 loss: 0.2598 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2598\n", + "06/15 06:12:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:39 time: 0.2858 data_time: 0.0263 memory: 1383 grad_norm: 0.8975 loss: 0.3959 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.3959\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:35 time: 0.3381 data_time: 0.0807 memory: 1383 grad_norm: 0.5466 loss: 0.4871 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4871\n", + "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:12:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:12 time: 0.1230 data_time: 0.0693 memory: 466 \n", + "06/15 06:12:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:09 time: 0.1138 data_time: 0.0632 memory: 466 \n", + "06/15 06:12:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:07 time: 0.1214 data_time: 0.0672 memory: 466 \n", + "06/15 06:12:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.1539 data_time: 0.1001 memory: 466 \n", + "06/15 06:12:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1488 data_time: 0.0936 memory: 466 \n", + "06/15 06:12:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) 
[2][120/120] eta: 0:00:00 time: 0.1030 data_time: 0.0539 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + 
"no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 39.91\n", + "aerobic split jump 29.66\n", + "aerobic scissors leap 90.70\n", + "aerobic turn 96.92\n", + "mAP 64.30\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 
volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no 
such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + 
"no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:12:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 64.2982 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 53.8889 mAP/v_map_0.10:0.90: 34.5833 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0744 time: 0.1270\n", + "06/15 06:12:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:28 time: 0.2786 data_time: 0.0358 memory: 1383 grad_norm: 1.0935 loss: 0.3780 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 0.8667 prec@top5: 0.1733 loss_action_cls: 0.3780\n", + "06/15 06:12:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:22 time: 0.3217 data_time: 0.0573 memory: 1383 grad_norm: 1.4278 loss: 0.3261 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3261\n", + "06/15 06:12:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:15 time: 0.2823 data_time: 0.0358 memory: 
1383 grad_norm: 0.6230 loss: 0.4514 recall@thr=0.5: 0.9286 prec@thr=0.5: 0.9286 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4514\n", + "06/15 06:12:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:08 time: 0.2561 data_time: 0.0115 memory: 1383 grad_norm: 0.1768 loss: 0.3241 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3241\n", + "06/15 06:12:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:02 time: 0.3094 data_time: 0.0422 memory: 1383 grad_norm: 0.4979 loss: 0.4081 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4081\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:56 time: 0.2776 data_time: 0.0266 memory: 1383 grad_norm: 0.7488 loss: 0.4131 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4131\n", + "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n", + "06/15 06:13:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:11 time: 0.1182 data_time: 0.0691 memory: 466 \n", + "06/15 06:13:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:09 time: 0.1132 data_time: 0.0628 memory: 466 \n", + "06/15 06:13:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:07 time: 0.1542 data_time: 0.0996 memory: 466 \n", + "06/15 06:13:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 
0.1479 data_time: 0.0937 memory: 466 \n", + "06/15 06:13:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1232 data_time: 0.0726 memory: 466 \n", + "06/15 06:13:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1029 data_time: 0.0529 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 
42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 29.65\n", + "aerobic split jump 20.83\n", + "aerobic scissors leap 90.63\n", + "aerobic turn 97.10\n", + "mAP 59.55\n", + "\u001b[2Klinking tubes... 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial 
duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second 
pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:13:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 59.5538 
mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0750 time: 0.1264\n", + "06/15 06:13:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:50 time: 0.3089 data_time: 0.0514 memory: 1383 grad_norm: 0.2046 loss: 0.3238 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 0.9091 prec@top3: 0.3030 recall@top5: 0.9091 prec@top5: 0.1818 loss_action_cls: 0.3238\n", + "06/15 06:13:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:46 time: 0.3790 data_time: 0.0937 memory: 1383 grad_norm: 0.7468 loss: 0.4123 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4123\n", + "06/15 06:13:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:39 time: 0.2685 data_time: 0.0171 memory: 1383 grad_norm: 0.1904 loss: 0.4407 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4407\n", + "06/15 06:13:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:32 time: 0.2546 data_time: 0.0100 memory: 1383 grad_norm: 0.1966 loss: 0.4266 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4266\n", + "06/15 06:13:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:27 time: 0.3283 data_time: 0.0548 memory: 1383 grad_norm: 0.3165 loss: 0.3308 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3308\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: 
slowonly_k400_multisports_20230615_061017\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:21 time: 0.2671 data_time: 0.0151 memory: 1383 grad_norm: 0.1487 loss: 0.3003 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3003\n", + "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n", + "06/15 06:13:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:12 time: 0.1273 data_time: 0.0729 memory: 466 \n", + "06/15 06:14:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1306 data_time: 0.0797 memory: 466 \n", + "06/15 06:14:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:08 time: 0.1539 data_time: 0.0979 memory: 466 \n", + "06/15 06:14:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:05 time: 0.1355 data_time: 0.0815 memory: 466 \n", + "06/15 06:14:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1132 data_time: 0.0646 memory: 466 \n", + "06/15 06:14:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1050 data_time: 0.0553 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis 
jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 
basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 23.92\n", + "aerobic split jump 19.60\n", + "aerobic scissors leap 91.02\n", + "aerobic turn 96.05\n", + "mAP 57.64\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no 
such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 
aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not 
evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:14:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 57.6444 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 18.3250 data_time: 0.0753 time: 0.1274\n", + "06/15 06:14:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:14 time: 0.2810 data_time: 0.0329 memory: 1383 grad_norm: 0.6113 loss: 0.4312 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4312\n", + "06/15 06:14:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:09 time: 0.3316 data_time: 0.0732 memory: 1383 grad_norm: 0.2282 loss: 0.3932 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.3932\n", + "06/15 06:14:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:03 time: 0.2738 data_time: 0.0286 memory: 1383 grad_norm: 0.2938 loss: 0.3828 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.3828\n", + "06/15 06:14:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:56 time: 0.2756 data_time: 0.0192 memory: 1383 grad_norm: 0.1112 loss: 0.3722 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3722\n", + "06/15 06:14:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.3193 data_time: 
0.0573 memory: 1383 grad_norm: 0.6399 loss: 0.4427 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4427\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:45 time: 0.2535 data_time: 0.0093 memory: 1383 grad_norm: 0.0985 loss: 0.2719 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2719\n", + "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n", + "06/15 06:14:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:13 time: 0.1329 data_time: 0.0774 memory: 466 \n", + "06/15 06:14:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:12 time: 0.1787 data_time: 0.1259 memory: 466 \n", + "06/15 06:14:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:08 time: 0.1363 data_time: 0.0829 memory: 466 \n", + "06/15 06:14:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:05 time: 0.1012 data_time: 0.0513 memory: 466 \n", + "06/15 06:15:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1095 data_time: 0.0593 memory: 466 \n", + "06/15 06:15:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1033 data_time: 0.0536 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic 
horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball 
pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 14.21\n", + "aerobic split jump 15.37\n", + "aerobic scissors leap 91.25\n", + "aerobic turn 91.43\n", + "mAP 53.06\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 
football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such 
label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such 
label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 20.00\n", + "mAP 14.00\n", + "06/15 06:15:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 53.0627 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 22.4444 mAP/v_map_0.50:0.95: 7.0250 data_time: 0.0749 time: 0.1267\n", + "06/15 06:15:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:39 time: 0.3193 data_time: 0.0634 memory: 1383 grad_norm: 0.5229 loss: 0.3929 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3929\n", + "06/15 06:15:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:33 time: 0.2972 data_time: 0.0439 memory: 1383 grad_norm: 0.4621 loss: 0.2891 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2891\n", + "06/15 06:15:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:27 time: 0.2567 data_time: 0.0127 memory: 1383 grad_norm: 0.2534 loss: 0.3438 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3438\n", + "06/15 06:15:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:21 time: 0.3277 data_time: 0.0645 memory: 1383 grad_norm: 0.0856 loss: 0.1859 recall@thr=0.5: 
0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.1859\n", + "06/15 06:15:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:15 time: 0.2995 data_time: 0.0503 memory: 1383 grad_norm: 0.3619 loss: 0.3205 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3205\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.2619 data_time: 0.0190 memory: 1383 grad_norm: 0.3812 loss: 0.3911 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3911\n", + "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n", + "06/15 06:15:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:17 time: 0.1739 data_time: 0.1178 memory: 466 \n", + "06/15 06:15:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:13 time: 0.1519 data_time: 0.1032 memory: 466 \n", + "06/15 06:15:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:08 time: 0.1031 data_time: 0.0536 memory: 466 \n", + "06/15 06:15:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0505 memory: 466 \n", + "06/15 06:15:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1126 data_time: 0.0620 memory: 466 \n", + "06/15 06:15:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0995 data_time: 0.0506 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such 
label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point 
shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 10.49\n", + "aerobic split jump 14.53\n", + "aerobic scissors leap 90.24\n", + "aerobic turn 87.53\n", + "mAP 50.70\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no 
such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 40.00\n", + "mAP 35.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive 
support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 
basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 40.00\n", + "mAP 19.00\n", + "06/15 06:15:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 50.6970 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 19.0000 mAP/v_map_0.05:0.45: 35.0000 mAP/v_map_0.10:0.90: 20.7778 mAP/v_map_0.50:0.95: 8.4000 data_time: 0.0724 time: 0.1229\n", + "06/15 06:16:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.3578 data_time: 0.0847 memory: 1383 grad_norm: 0.5369 loss: 0.3628 recall@thr=0.5: 0.9167 prec@thr=0.5: 0.9167 recall@top3: 0.9167 prec@top3: 0.3056 recall@top5: 0.9167 prec@top5: 0.1833 loss_action_cls: 0.3628\n", + "06/15 06:16:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.2652 data_time: 0.0202 memory: 1383 grad_norm: 0.1603 loss: 0.2293 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2293\n", + "06/15 06:16:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2710 data_time: 0.0178 memory: 1383 grad_norm: 0.3857 loss: 0.2737 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 
recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2737\n", + "06/15 06:16:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.3420 data_time: 0.0698 memory: 1383 grad_norm: 0.1271 loss: 0.2149 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2149\n", + "06/15 06:16:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.2673 data_time: 0.0232 memory: 1383 grad_norm: 0.0990 loss: 0.2749 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2749\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:34 time: 0.2612 data_time: 0.0156 memory: 1383 grad_norm: 0.1387 loss: 0.3211 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3211\n", + "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n", + "06/15 06:16:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:16 time: 0.1657 data_time: 0.1063 memory: 466 \n", + "06/15 06:16:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:11 time: 0.1164 data_time: 0.0654 memory: 466 \n", + "06/15 06:16:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:07 time: 0.1053 data_time: 0.0546 memory: 466 \n", + "06/15 06:16:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:04 time: 0.1005 data_time: 0.0511 memory: 466 \n", + "06/15 06:16:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) 
[7][100/120] eta: 0:00:02 time: 0.1035 data_time: 0.0533 memory: 466 \n", + "06/15 06:16:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1382 data_time: 0.0850 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such 
label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 11.65\n", + "aerobic split jump 15.62\n", + "aerobic scissors leap 89.83\n", + "aerobic turn 93.96\n", + "mAP 52.77\n", + "\u001b[2Klinking tubes... 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial 
duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second 
pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 38.67\n", + "aerobic turn 20.00\n", + "mAP 14.67\n", + "06/15 06:16:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 52.7652 
mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.6667 mAP/v_map_0.05:0.45: 40.6944 mAP/v_map_0.10:0.90: 22.6389 mAP/v_map_0.50:0.95: 6.6833 data_time: 0.0691 time: 0.1213\n", + "06/15 06:16:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.3243 data_time: 0.0649 memory: 1383 grad_norm: 0.1808 loss: 0.3648 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3648\n", + "06/15 06:16:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.2578 data_time: 0.0117 memory: 1383 grad_norm: 0.0784 loss: 0.2355 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2355\n", + "06/15 06:17:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3075 data_time: 0.0490 memory: 1383 grad_norm: 0.1707 loss: 0.3776 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3776\n", + "06/15 06:17:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.3092 data_time: 0.0576 memory: 1383 grad_norm: 0.1387 loss: 0.3873 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3873\n", + "06/15 06:17:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.2578 data_time: 0.0100 memory: 1383 grad_norm: 0.2137 loss: 0.3337 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3337\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: 
slowonly_k400_multisports_20230615_061017\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2755 data_time: 0.0148 memory: 1383 grad_norm: 0.0712 loss: 0.2038 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2038\n", + "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n", + "06/15 06:17:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:11 time: 0.1180 data_time: 0.0649 memory: 466 \n", + "06/15 06:17:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:09 time: 0.1168 data_time: 0.0667 memory: 466 \n", + "06/15 06:17:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:06 time: 0.1026 data_time: 0.0535 memory: 466 \n", + "06/15 06:17:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0533 memory: 466 \n", + "06/15 06:17:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1444 data_time: 0.0915 memory: 466 \n", + "06/15 06:17:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1496 data_time: 0.0962 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis 
jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 
basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 11.34\n", + "aerobic split jump 12.82\n", + "aerobic scissors leap 90.68\n", + "aerobic turn 90.47\n", + "mAP 51.33\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no 
such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 80.00\n", + "mAP 45.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 
aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not 
evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 72.00\n", + "aerobic turn 20.00\n", + "mAP 23.00\n", + "06/15 06:17:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 51.3281 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 23.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 24.4444 mAP/v_map_0.50:0.95: 9.7250 data_time: 0.0704 time: 0.1216\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "# Train the model using MIM\n", + "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n", + " --work-dir work_dirs/stad_model/" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "yVjHqupPooZ2" + }, + "source": [ + "## 4. Inferring the Spatiotemporal Action Detection Model\n", + "\n", + "After training the detection model and the spatiotemporal action detection model, we can use the spatiotemporal action detection demo for inference and visualize the model's performance.\n", + "\n", + "Since the tutorial uses a limited training dataset, the model's performance is not optimal, so a pre-trained model is used for visualization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NQF1yrEhooZ3", + "outputId": "5331fbb6-7075-415c-f6f0-ec41c4b584a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "Performing Human Detection for each frame\n", + "[>>] 99/99, 7.0 task/s, elapsed: 14s, ETA: 0s\n", + "Loads checkpoint by http backend from path: 
https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Performing SpatioTemporal Action Detection for each clip\n", + "[>>] 99/99, 17.1 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n", + "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n", + "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n", + "\n", + "Moviepy - Done !\n", + "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n" + ] + } + ], + "source": [ + "!python ../../demo/demo_spatiotemporal_det.py \\\n", + " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n", + " data/demo_spatiotemporal_det.mp4 \\\n", + " --config configs/slowonly_k400_multisports.py \\\n", + " --checkpoint https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n", + " --det-config configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --det-score-thr 0.85 \\\n", + " --action-score-thr 0.8 \\\n", + " --label-map ../../tools/data/multisports/label_map.txt \\\n", + " --predict-stepsize 8 \\\n", + " --output-stepsize 1 \\\n", + " --output-fps 24" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 741 + }, + "id": "9JmeIkh5ooZ3", + "outputId": "7fc38469-d8c4-4a02-81e7-ff93b88a62b2" + }, + "outputs": [], + "source": [ + "# Show Video\n", + "import moviepy.editor\n", + "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "ipy_stad", 
+ "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/projects/stad_tutorial/demo_stad_zh_CN.ipynb b/projects/stad_tutorial/demo_stad_zh_CN.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e095ebaa9cc3d606ef473755349b90e265bac85c --- /dev/null +++ b/projects/stad_tutorial/demo_stad_zh_CN.ipynb @@ -0,0 +1,4107 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "B74HkZjCxQ_6" + }, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MwmrGv9exRAH" + }, + "source": [ + "# 基于 MMAction2 进行时空行为检测任务\n", + "欢迎使用 MMAction2! 这是一篇关于如何使用 MMAction2 进行时空行为检测的教程。在此教程中,我们会以 MultiSports 数据集为例,提供时空行为检测的完整步骤教程,包括\n", + "- 准备时空行为检测数据集\n", + "- 训练检测模型\n", + "- 准备 AVA 格式的数据集\n", + "- 训练时空行为检测模型\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "em5lgDTUxRAI" + }, + "source": [ + "## 0. 
安装 MMAction2 和 MMDetection" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bBM9DCrsxRAJ", + "outputId": "b310311f-f05e-4a5c-b6e5-8e6ee7e0dfae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openmim\n", + " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n", + "Collecting colorama (from openmim)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting model-index (from openmim)\n", + " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n", + "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n", + "Collecting ordered-set (from model-index->openmim)\n", + " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", + "Requirement already satisfied: 
python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n", + "Installing collected packages: ordered-set, colorama, model-index, openmim\n", + "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmengine\n", + " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting addict (from mmengine)\n", + " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n", + "Collecting yapf (from mmengine)\n", + " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m29.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from 
matplotlib->mmengine) (8.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n", + "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n", + " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n", + "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n", + " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n", + "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n", + " Attempting uninstall: platformdirs\n", + " Found existing installation: platformdirs 3.3.0\n", + " Uninstalling platformdirs-3.3.0:\n", + " Successfully uninstalled platformdirs-3.3.0\n", + "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: 
https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmcv\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n", + "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in 
/usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: 
https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n", + "Collecting mmdet\n", + " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n", + "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n", + "Collecting terminaltables (from mmdet)\n", + " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n", + "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n", + "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from 
mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n", + "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n", + "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n", + "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from 
yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n", + "Installing collected packages: terminaltables, mmdet\n", + "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n", + "Cloning into 'mmaction2'...\n", + "remote: Enumerating objects: 22869, done.\u001b[K\n", + "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n", + "remote: Compressing objects: 100% (801/801), done.\u001b[K\n", + "remote: Total 22869 (delta 854), reused 1171 (delta 685), pack-reused 21378\u001b[K\n", + "Receiving objects: 100% (22869/22869), 82.81 MiB | 27.92 MiB/s, done.\n", + "Resolving deltas: 100% (15952/15952), done.\n", + "/content/mmaction2\n", + "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Obtaining file:///content/mmaction2\n", + " Running command python setup.py egg_info\n", + " running egg_info\n", + " creating /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info\n", + " writing /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/requires.txt\n", + " writing top-level names to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/top_level.txt\n", + " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " warning: no files found 
matching 'mmaction/.mim/model-index.yml'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n", + " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n", + " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n", + " adding license file 'LICENSE'\n", + " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n", + " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m98.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n", + " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n", + "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n", + "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n", + "Requirement already satisfied: filelock in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n", + "Installing collected packages: einops, decord, mmaction2\n", + " Running setup.py develop for mmaction2\n", + " Running command python setup.py develop\n", + " running develop\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` and ``easy_install``.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://github.com/pypa/setuptools/issues/917 for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " easy_install.initialize_options(self)\n", + " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n", + " !!\n", + "\n", + " ********************************************************************************\n", + " Please avoid running ``setup.py`` directly.\n", + " Instead, use pypa/build, pypa/installer, pypa/build or\n", + " other standards-based tools.\n", + "\n", + " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n", + " ********************************************************************************\n", + "\n", + " !!\n", + " 
self.initialize_options()\n", + " running egg_info\n", + " creating mmaction2.egg-info\n", + " writing mmaction2.egg-info/PKG-INFO\n", + " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n", + " writing requirements to mmaction2.egg-info/requires.txt\n", + " writing top-level names to mmaction2.egg-info/top_level.txt\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " reading manifest template 'MANIFEST.in'\n", + " adding license file 'LICENSE'\n", + " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n", + " running build_ext\n", + " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n", + " Adding mmaction2 1.0.0 to easy-install.pth file\n", + "\n", + " Installed /content/mmaction2\n", + "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n", + "/content/mmaction2/projects/stad_tutorial\n" + ] + } + ], + "source": [ + "%pip install -U openmim\n", + "!mim install mmengine\n", + "!mim install mmcv\n", + "!mim install mmdet\n", + "\n", + "!git clone https://github.com/open-mmlab/mmaction2.git\n", + "\n", + "%cd mmaction2\n", + "%pip install -v -e .\n", + "%cd projects/stad_tutorial" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "4M1PQASJxRAM" + }, + "source": [ + "## 1. 
准备时空行为检测数据集\n", + "\n", + "类似于检测任务需要提供检测框标注,时空行为检测任务需要对时间和空间同时定位,所以需要提供更复杂的 tube 标注。以 MultiSports 数据集的标注为例,`gttubes` 字段提供了视频中所有的目标动作标注,以下为一个标注片段:\n", + "\n", + "```\n", + " 'gttubes': {\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n", + " {\n", + " 10: # 类别标号\n", + " [\n", + " array([[ 377., 904., 316., 1016., 584.], # 类别 10 的第 1 个 tube,\n", + " [ 378., 882., 315., 1016., 579.], # shape (n, 5): 表示 n 帧,每帧标注中包括 (帧号,x1,y1, x2, y2)\n", + " ...\n", + " [ 398., 861., 304., 954., 549.]], dtype=float32),\n", + "\n", + " array([[ 399., 881., 308., 955., 542.], # 类别 10 的第 2 个 tube\n", + " [ 400., 862., 303., 988., 539.],\n", + " [ 401., 853., 292., 1000., 535.],\n", + " ...])\n", + " ...\n", + "\n", + " ] ,\n", + " 9: # 类别标号\n", + " [\n", + " array(...), # 类别 9 的第 1 个 tube\n", + " array(...), # 类别 9 的第 2 个 tube\n", + " ...\n", + " ]\n", + " ...\n", + " }\n", + " }\n", + "```\n", + "\n", + "标注文件中还需要提供其他字段的信息,完整的真值文件包括以下信息:\n", + "```\n", + "{\n", + " 'labels': # 标签列表\n", + " ['aerobic push up', 'aerobic explosive push up', ...],\n", + " 'train_videos': # 训练视频列表\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n", + " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n", + " ...\n", + " ]\n", + " ]\n", + " 'test_videos': # 测试视频列表\n", + " [\n", + " [\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n", + " ...\n", + " ]\n", + " ]\n", + " 'n_frames': # dict 文件,提供各个视频的帧数信息\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n", + " ...\n", + " }\n", + " 'resolution': # dict 文件,提供各个视频的分辨率信息\n", + " {\n", + " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n", + " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n", + " ...\n", + " }\n", + " 'gt_tubes': # dict 文件,提供 tube 的检测框信息\n", + " {\n", + " ... 
# 格式参考上述说明\n", + " }\n", + "}\n", + "```\n", + "后续的实验基于 MultiSports-tiny 进行,我们从 MultiSports 中抽取了少量视频,用于演示整个流程。" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fiJPDuR9xRAQ", + "outputId": "8b3d8719-a9c0-4a59-d220-a3626fa34d3b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-06-15 06:41:29-- https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 8.48.85.214, 8.48.85.207, 8.48.85.208, ...\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|8.48.85.214|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 82780160 (79M) [application/x-tar]\n", + "Saving to: ‘data/multisports-tiny.tar’\n", + "\n", + "multisports-tiny.ta 100%[===================>] 78.95M 27.9MB/s in 2.8s \n", + "\n", + "2023-06-15 06:41:32 (27.9 MB/s) - ‘data/multisports-tiny.tar’ saved [82780160/82780160]\n", + "\n", + "multisports-tiny/multisports/\n", + "multisports-tiny/multisports/test/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/\n", + "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n", + "multisports-tiny/multisports/annotations/\n", + "multisports-tiny/multisports/annotations/multisports_GT.pkl\n", + "multisports-tiny/multisports/trainval/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n", + "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 
0 to remove and 46 not upgraded.\n", + "Need to get 43.0 kB of archives.\n", + "After this operation, 115 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n", + "Fetched 43.0 kB in 0s (253 kB/s)\n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 122541 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n", + "Unpacking tree (1.8.0-1) ...\n", + "Setting up tree (1.8.0-1) ...\n", + "Processing triggers for man-db (2.9.1-1) ...\n", + "\u001b[01;34mdata\u001b[00m\n", + "├── \u001b[01;34mmultisports\u001b[00m\n", + "│   ├── \u001b[01;34mannotations\u001b[00m\n", + "│   │   └── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "│   ├── \u001b[01;34mtest\u001b[00m\n", + "│   │   └── \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "│   │   └── \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n", + "│   └── \u001b[01;34mtrainval\u001b[00m\n", + "│   └── \u001b[01;34maerobic_gymnastics\u001b[00m\n", + "│   ├── \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n", + "│   ├── \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n", + "│   └── \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n", + "└── \u001b[01;31mmultisports-tiny.tar\u001b[00m\n", + "\n", + "6 directories, 6 files\n" + ] + } + ], + "source": [ + "# 下载数据集\n", + "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n", + "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n", + "!apt-get -q install tree\n", + "!tree data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "XjG0dEE8xRAS" + }, + "source": [ + "## 2. 
训练检测模型\n", + "\n", + "在 SlowOnly + Det 的范式中,需要先训练人体检测器,再基于检测结果来预测行为。这一节中,我们基于上一节中的标注格式和 MMDetection 算法库训练检测模型。\n", + "\n", + "### 2.1 构建检测数据集标注(COCO 格式)\n", + "\n", + "基于时空行为检测数据集的标注信息,我们可以构建一个 COCO 格式的检测数据集,用于训练检测模型。我们提供了一个工具脚本对 MultiSports 格式的标注进行转换,如果需要基于其他格式转换,可以参考 MMDetection 提供的[自定义数据集](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html)文档。" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "inBtClMIxRAV", + "outputId": "3ac5199b-562f-48c4-da27-819d34069213" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "├── multisports_det_anno_train.json\n", + "├── multisports_det_anno_val.json\n", + "└── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "\n", + "0 directories, 3 files\n" + ] + } + ], + "source": [ + "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n", + "!tree data/multisports/annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TkPONRezxRAZ", + "outputId": "0f8075a1-47fb-490d-9c88-4904f45363fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will generate 3 rgb dir for aerobic_gymnastics.\n", + "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n", + "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n" + ] + } + ], + "source": [ + "!python tools/generate_rgb.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MP-umqqnxRAa" + }, + "source": [ + "### 2.2 修改 config 文件\n", + "\n", + "我们以 faster-rcnn_x101-64x4d_fpn_1x_coco 为基础配置,做如下修改,在 MultiSports 数据集上进行训练。需要修改以下几个部分:\n", + "- 模型的类别数量\n", + "- 学习率调整策略\n", + "- 
优化器配置\n", + "- 数据集/标注文件路径\n", + "- 评测器配置\n", + "- 预训练模型\n", + "\n", + "更详细的教程可以参考 MMDetection 提供的[准备配置文件](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9)文档。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yMw9MrI0xRAc", + "outputId": "1f5ee99a-d4cb-45b0-df71-f0209a9b6275" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Copyright (c) OpenMMLab. All rights reserved.\n", + "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n", + "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n", + "\n", + "# take 2 epochs as an example\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "\n", + "# learning rate\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "\n", + "# optimizer\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n", + "\n", + "dataset_type = 'CocoDataset'\n", + "# modify metainfo\n", + "metainfo = {\n", + " 'classes': ('person', ),\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "\n", + "# specify metainfo, dataset path\n", + "data_root = 'data/multisports/'\n", + "\n", + "train_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "test_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " 
data_prefix=dict(img='rawframes/'),\n", + " metainfo=metainfo))\n", + "\n", + "# specify annotaition file path, modify metric items\n", + "val_evaluator = dict(\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "test_evaluator = dict(\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5],\n", + ")\n", + "\n", + "# specify pretrain checkpoint\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n" + ] + } + ], + "source": [ + "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "S3Ux8echxRAe" + }, + "source": [ + "### 2.3 训练检测模型" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "MYtjYFU5xRAf" + }, + "source": [ + "利用 MIM 可以在当前路径直接训练 MMDetection 模型,这里提供最简单的单卡训练示例,更多训练命令可以参考 MIM [教程](https://github.com/open-mmlab/mim#command)。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "56m--2T8xRAg", + "outputId": "d47ceca0-e930-4063-e25d-739a44410b86" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. 
\n", + "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1318688827\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits 
-Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1318688827\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " 
in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " 
rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " 
filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', 
with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", 
+ " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/det_model'\n", + "\n", + "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + 
"(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n", + "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n", + "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n", + "100% 89.9M/89.9M [00:03<00:00, 31.4MB/s]\n", + "06/15 06:42:53 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source 
state_dict: conv1.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "100% 158M/158M [00:06<00:00, 24.4MB/s]\n", + "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n", + "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n", + "06/15 06:43:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:02:00 time: 0.6468 data_time: 0.0127 memory: 3419 loss: 0.4823 loss_rpn_cls: 0.0063 loss_rpn_bbox: 0.0151 loss_cls: 0.1676 acc: 95.0195 loss_bbox: 0.2933\n", + "06/15 06:43:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:17 time: 0.4922 data_time: 0.0077 memory: 3419 loss: 0.4234 loss_rpn_cls: 0.0031 loss_rpn_bbox: 0.0134 loss_cls: 0.1394 acc: 91.9922 loss_bbox: 0.2676\n", + "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 
Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n", + "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:44:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1269 data_time: 0.0112 memory: 3419 \n", + "06/15 06:44:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1159 data_time: 0.0032 memory: 682 \n", + "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.913\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.817\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.908\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n", + "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.913 -1.000 -1.000 -1.000 0.817 0.908\n", + "06/15 06:44:23 - 
mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9600 data_time: 0.0065 time: 0.1205\n", + "06/15 06:44:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5233 data_time: 0.0099 memory: 3419 loss: 0.3250 loss_rpn_cls: 0.0025 loss_rpn_bbox: 0.0107 loss_cls: 0.1116 acc: 95.2148 loss_bbox: 0.2002\n", + "06/15 06:45:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5354 data_time: 0.0083 memory: 3419 loss: 0.3042 loss_rpn_cls: 0.0013 loss_rpn_bbox: 0.0105 loss_cls: 0.0946 acc: 94.9219 loss_bbox: 0.1978\n", + "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n", + "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:45:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0050 memory: 3419 \n", + "06/15 06:45:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1225 data_time: 0.0058 memory: 682 \n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.07s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.912\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | 
area=medium | maxDets=1000 ] = 0.747\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.916\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.955\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.954\n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.912 -1.000 -1.000 -1.000 0.747 0.916\n", + "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9550 data_time: 0.0052 time: 0.1228\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --work-dir work_dirs/det_model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "-pf9MnuUxRAh" + }, + "source": [ + "### 2.4 生成 proposal bboxes\n", + "\n", + "在时空行为检测模型训练时,需要基于检测模型推理得到的 proposal,而不能基于标注的检测框。因此我们需要利用训练好的检测模型对整个数据集进行推理,得到 proposal 后转换为需要的格式,用于后续训练。\n", + "\n", + "#### 2.4.1 将数据集转换为 Coco 格式\n", + "\n", + "我们提供了脚本将 MultiSports 数据集转换成没有真值的标注格式,用于推理。" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nL2n0AKJxRAi", + "outputId": "51907af1-7da3-4713-8e90-a61b052000aa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[>>] 2350/2350, 1606.7 task/s, elapsed: 1s, ETA: 0s\n", + "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n" + ] + } + ], + "source": [ + "!echo 'person' > 
data/multisports/annotations/label_map.txt\n", + "!python tools/images2coco.py \\\n", + " data/multisports/rawframes \\\n", + " data/multisports/annotations/label_map.txt \\\n", + " ms_infer_anno.json" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_REQniysxRAj" + }, + "source": [ + "#### 2.4.2 推理生成 proposal file" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ShnTsjs1xRAk" + }, + "source": [ + "MMDetection 模型的推理同样基于 MIM,更多测试命令请参考 MIM [教程](https://github.com/open-mmlab/mim#command)。\n", + "\n", + "推理完成后,会将推理结果保存在 'data/multisports/annotations/ms_det_proposals.pkl'。" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DXnT4aArxRAm", + "outputId": "565faf02-4b7f-49ab-f30f-b20e7eb09085" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. 
\n", + "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 1403639615\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits 
-Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 1403639615\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " type='FasterRCNN',\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[103.53, 116.28, 123.675],\n", + " std=[1.0, 1.0, 1.0],\n", + " bgr_to_rgb=False,\n", + " pad_size_divisor=32),\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=False),\n", + " norm_eval=True,\n", + " style='caffe',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n", + " neck=dict(\n", + " type='FPN',\n", + " 
in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0]),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2]),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " 
rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)))\n", + "dataset_type = 'CocoDataset'\n", + "data_root = 'data/multisports/'\n", + "backend_args = None\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_train.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " 
filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomChoiceResize',\n", + " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n", + " (1333, 768), (1333, 800)],\n", + " keep_ratio=True),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/multisports_det_anno_val.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='DefaultSampler', shuffle=False),\n", + " dataset=dict(\n", + " type='CocoDataset',\n", + " data_root='data/multisports/',\n", + " ann_file='annotations/ms_infer_anno.json',\n", + " data_prefix=dict(img='rawframes/'),\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', backend_args=None),\n", + " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n", + " dict(type='LoadAnnotations', 
with_bbox=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + " ],\n", + " backend_args=None,\n", + " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n", + "val_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "test_evaluator = dict(\n", + " type='CocoMetric',\n", + " ann_file='data/multisports/annotations/ms_infer_anno.json',\n", + " metric='bbox',\n", + " format_only=False,\n", + " backend_args=None,\n", + " metric_items=['mAP_50', 'AR@100'],\n", + " iou_thrs=[0.5])\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n", + "val_cfg = dict(type='ValLoop')\n", + "test_cfg = dict(type='TestLoop')\n", + "param_scheduler = [\n", + " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n", + "auto_scale_lr = dict(enable=False, base_batch_size=16)\n", + "default_scope = 'mmdet'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='DetVisualizationHook'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", 
+ " name='visualizer')\n", + "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n", + "log_level = 'INFO'\n", + "load_from = 'work_dirs/det_model/epoch_2.pth'\n", + "resume = False\n", + "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n", + "launcher = 'none'\n", + "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n", + "\n", + "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " 
-------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DetVisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "loading annotations into memory...\n", + "Done (t=0.00s)\n", + "creating index...\n", + "index created!\n", + "06/15 06:45:56 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "06/15 06:45:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n", + "06/15 06:46:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:46 time: 0.1507 data_time: 0.0046 memory: 512 \n", + "06/15 06:46:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:06 time: 0.1217 data_time: 0.0059 memory: 512 \n", + "06/15 06:46:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:47 time: 0.1193 data_time: 0.0022 memory: 512 \n", + "06/15 06:46:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 
Epoch(test) [ 200/2350] eta: 0:04:34 time: 0.1197 data_time: 0.0023 memory: 512 \n", + "06/15 06:46:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:27 time: 0.1258 data_time: 0.0073 memory: 512 \n", + "06/15 06:46:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1215 data_time: 0.0026 memory: 512 \n", + "06/15 06:46:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:12 time: 0.1242 data_time: 0.0046 memory: 512 \n", + "06/15 06:46:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:04 time: 0.1218 data_time: 0.0029 memory: 512 \n", + "06/15 06:46:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:58 time: 0.1229 data_time: 0.0042 memory: 512 \n", + "06/15 06:46:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 500/2350] eta: 0:03:51 time: 0.1229 data_time: 0.0048 memory: 512 \n", + "06/15 06:47:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:44 time: 0.1193 data_time: 0.0020 memory: 512 \n", + "06/15 06:47:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:37 time: 0.1234 data_time: 0.0060 memory: 512 \n", + "06/15 06:47:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:30 time: 0.1184 data_time: 0.0025 memory: 512 \n", + "06/15 06:47:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:24 time: 0.1200 data_time: 0.0041 memory: 512 \n", + "06/15 06:47:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:17 time: 0.1216 data_time: 0.0046 memory: 512 \n", + "06/15 06:47:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:11 time: 0.1184 data_time: 0.0024 memory: 512 \n", + "06/15 06:47:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:04 
time: 0.1234 data_time: 0.0064 memory: 512 \n", + "06/15 06:47:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:58 time: 0.1196 data_time: 0.0028 memory: 512 \n", + "06/15 06:47:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:52 time: 0.1217 data_time: 0.0046 memory: 512 \n", + "06/15 06:48:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:45 time: 0.1220 data_time: 0.0046 memory: 512 \n", + "06/15 06:48:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:39 time: 0.1203 data_time: 0.0028 memory: 512 \n", + "06/15 06:48:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1100/2350] eta: 0:02:33 time: 0.1231 data_time: 0.0055 memory: 512 \n", + "06/15 06:48:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:27 time: 0.1207 data_time: 0.0033 memory: 512 \n", + "06/15 06:48:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:21 time: 0.1217 data_time: 0.0049 memory: 512 \n", + "06/15 06:48:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:14 time: 0.1211 data_time: 0.0038 memory: 512 \n", + "06/15 06:48:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:08 time: 0.1242 data_time: 0.0070 memory: 512 \n", + "06/15 06:48:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:02 time: 0.1249 data_time: 0.0077 memory: 512 \n", + "06/15 06:48:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:56 time: 0.1181 data_time: 0.0022 memory: 512 \n", + "06/15 06:48:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:50 time: 0.1219 data_time: 0.0055 memory: 512 \n", + "06/15 06:49:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:44 time: 0.1198 data_time: 0.0034 
memory: 512 \n", + "06/15 06:49:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:37 time: 0.1194 data_time: 0.0028 memory: 512 \n", + "06/15 06:49:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:31 time: 0.1228 data_time: 0.0059 memory: 512 \n", + "06/15 06:49:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:25 time: 0.1193 data_time: 0.0026 memory: 512 \n", + "06/15 06:49:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:19 time: 0.1232 data_time: 0.0060 memory: 512 \n", + "06/15 06:49:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:13 time: 0.1199 data_time: 0.0028 memory: 512 \n", + "06/15 06:49:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:07 time: 0.1205 data_time: 0.0035 memory: 512 \n", + "06/15 06:49:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:01 time: 0.1237 data_time: 0.0067 memory: 512 \n", + "06/15 06:49:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1190 data_time: 0.0024 memory: 512 \n", + "06/15 06:49:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1238 data_time: 0.0069 memory: 512 \n", + "06/15 06:50:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1183 data_time: 0.0020 memory: 512 \n", + "06/15 06:50:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1212 data_time: 0.0049 memory: 512 \n", + "06/15 06:50:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1212 data_time: 0.0044 memory: 512 \n", + "06/15 06:50:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1180 data_time: 0.0019 memory: 512 \n", + "06/15 06:50:25 - 
mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1233 data_time: 0.0062 memory: 512 \n", + "06/15 06:50:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1186 data_time: 0.0021 memory: 512 \n", + "06/15 06:50:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1227 data_time: 0.0064 memory: 512 \n", + "06/15 06:50:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1196 data_time: 0.0033 memory: 512 \n", + "06/15 06:50:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n", + "Loading and preparing results...\n", + "DONE (t=0.01s)\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.37s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.28s).\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n", + "06/15 06:50:45 - mmengine - 
\u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n", + "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n", + "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0042 time: 0.1219\n", + "\u001b[32mTesting finished successfully.\u001b[0m\n" + ] + } + ], + "source": [ + "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --out data/multisports/annotations/ms_det_proposals.pkl" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "1zErF-nsxRAo" + }, + "source": [ + "## 3. 训练时空行为检测模型\n", + "\n", + "### 3.1 转换标注文件以及 proposal 文件\n", + "\n", + "MultiSports 数据集提供的标注文件,以及 MMDetection 推理生成的 proposal 都需要进行格式转换,才能用于时空行为检测模型的训练。我们已经提供了相关的脚本工具,执行后即可生成指定格式" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "naAfcO4QxRAo", + "outputId": "2a309bef-241f-44fc-8276-b2ea4735e37d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading test result...\n", + "[>>] 2350/2350, 3582.6 task/s, elapsed: 1s, ETA: 0s\n", + "\u001b[01;34mdata/multisports/annotations\u001b[00m\n", + "├── label_map.txt\n", + "├── ms_det_proposals.pkl\n", + "├── ms_infer_anno.json\n", + "├── multisports_det_anno_train.json\n", + "├── multisports_det_anno_val.json\n", + "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n", + "├── multisports_proposals_train.pkl\n", + "├── multisports_proposals_val.pkl\n", + "├── multisports_train.csv\n", + "└── multisports_val.csv\n", + "\n", + "0 directories, 10 files\n" + ] + } + ], + "source": [ + "# 转换 anno 文件\n", + "!python ../../tools/data/multisports/parse_anno.py\n", + "\n", + "# 转换 proposal 
文件\n", + "!python tools/convert_proposals.py\n", + "\n", + "!tree data/multisports/annotations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "balpcJEbxRAp" + }, + "source": [ + "### 3.2 训练时空行为检测模型\n", + "\n", + "MMAction2 中已经支持训练 MultiSports 数据集,这里只需要修改 proposal 文件的路径即可, 详细配置可以参考 [config](configs/slowonly_k400_multisports.py) 文件。由于训练数据较少,配置中将在完整 MultiSports 数据集上训练得到的模型作为预训练模型,使用自定义数据集训练时不需要指定 `load_from` 配置。" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cIuQTmnuxRAq", + "outputId": "253d7f08-3c89-4e31-c5f4-3880aed5d817" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n", + "06/15 06:50:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 546414243\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n", + " PyTorch: 2.0.1+cu118\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 9.3\n", + " - C++ Version: 201703\n", + " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.8\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n", + " - CuDNN 8.7\n", + " - Magma 2.6.1\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.15.2+cu118\n", + " OpenCV: 4.7.0\n", + " MMEngine: 0.7.4\n", + "\n", + 
"Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 546414243\n", + " diff_rank_seed: False\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "06/15 06:50:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "default_scope = 'mmaction'\n", + "default_hooks = dict(\n", + " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n", + " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n", + " logger=dict(\n", + " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n", + " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n", + " checkpoint=dict(\n", + " type='CheckpointHook',\n", + " interval=1,\n", + " save_best='auto',\n", + " _scope_='mmaction'),\n", + " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n", + " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "log_processor = dict(\n", + " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n", + "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n", + "visualizer = dict(\n", + " type='ActionVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " _scope_='mmaction')\n", + "log_level = 'INFO'\n", + "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n", + "resume = False\n", + "url = 
'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + "num_classes = 66\n", + "model = dict(\n", + " type='FastRCNN',\n", + " _scope_='mmdet',\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n", + " ),\n", + " backbone=dict(\n", + " type='mmaction.ResNet3dSlowOnly',\n", + " depth=50,\n", + " pretrained=None,\n", + " pretrained2d=False,\n", + " lateral=False,\n", + " num_stages=4,\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_t=1,\n", + " pool1_stride_t=1,\n", + " spatial_strides=(1, 2, 2, 1)),\n", + " roi_head=dict(\n", + " type='AVARoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor3D',\n", + " roi_layer_type='RoIAlign',\n", + " output_size=8,\n", + " with_temporal_pool=True),\n", + " bbox_head=dict(\n", + " type='BBoxHeadAVA',\n", + " in_channels=2048,\n", + " num_classes=66,\n", + " multilabel=False,\n", + " dropout_ratio=0.5)),\n", + " data_preprocessor=dict(\n", + " type='mmaction.ActionDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " format_shape='NCTHW'),\n", + " train_cfg=dict(\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssignerAVA',\n", + " pos_iou_thr=0.9,\n", + " neg_iou_thr=0.9,\n", + " min_pos_iou=0.9),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=32,\n", + " pos_fraction=1,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=1.0)),\n", + " test_cfg=dict(rcnn=None))\n", + "dataset_type = 'AVADataset'\n", + "data_root = 'data/multisports/trainval'\n", + 
"anno_root = 'data/multisports/annotations'\n", + "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n", + "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n", + "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n", + "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n", + "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n", + "file_client_args = dict(io_backend='disk')\n", + "train_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n", + " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n", + " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "val_pipeline = [\n", + " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True,\n", + " _scope_='mmaction'),\n", + " dict(type='DecordDecode', _scope_='mmaction'),\n", + " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n", + " dict(\n", + " type='FormatShape',\n", + " input_format='NCTHW',\n", + " collapse=True,\n", + " _scope_='mmaction'),\n", + " dict(type='PackActionInputs', _scope_='mmaction')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " 
ann_file='data/multisports/annotations/multisports_train.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n", + " dict(type='DecordDecode'),\n", + " dict(type='RandomRescale', scale_range=(256, 320)),\n", + " dict(type='RandomCrop', size=256),\n", + " dict(type='Flip', flip_ratio=0.5),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_train.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_proposals_val.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=False, 
_scope_='mmaction'),\n", + " dataset=dict(\n", + " type='AVADataset',\n", + " ann_file='data/multisports/annotations/multisports_val.csv',\n", + " pipeline=[\n", + " dict(type='DecordInit', io_backend='disk'),\n", + " dict(\n", + " type='SampleAVAFrames',\n", + " clip_len=4,\n", + " frame_interval=16,\n", + " test_mode=True),\n", + " dict(type='DecordDecode'),\n", + " dict(type='Resize', scale=(-1, 256)),\n", + " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n", + " dict(type='PackActionInputs')\n", + " ],\n", + " num_classes=66,\n", + " proposal_file=\n", + " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n", + " data_prefix=dict(img='data/multisports/trainval'),\n", + " test_mode=True,\n", + " timestamp_start=1,\n", + " start_index=0,\n", + " use_frames=False,\n", + " fps=1,\n", + " _scope_='mmaction'))\n", + "val_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "test_evaluator = dict(\n", + " type='MultiSportsMetric',\n", + " ann_file='data/multisports/annotations/multisports_GT.pkl',\n", + " _scope_='mmaction')\n", + "train_cfg = dict(\n", + " type='EpochBasedTrainLoop',\n", + " max_epochs=8,\n", + " val_begin=1,\n", + " val_interval=1,\n", + " _scope_='mmaction')\n", + "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n", + "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR',\n", + " start_factor=0.1,\n", + " by_epoch=True,\n", + " begin=0,\n", + " end=5,\n", + " _scope_='mmaction'),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=8,\n", + " by_epoch=True,\n", + " milestones=[6, 7],\n", + " gamma=0.1,\n", + " _scope_='mmaction')\n", + "]\n", + "optim_wrapper = dict(\n", + " optimizer=dict(\n", + " type='SGD',\n", + " lr=0.01,\n", + " momentum=0.9,\n", + " weight_decay=1e-05,\n", + " _scope_='mmaction'),\n", + " 
clip_grad=dict(max_norm=5, norm_type=2))\n", + "launcher = 'none'\n", + "work_dir = 'work_dirs/stad_model/'\n", + "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n", + "\n", + "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) 
ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n", + "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n", + "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n", + "100% 124M/124M 
[00:05<00:00, 25.9MB/s]\n", + "06/15 06:51:12 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n", + "\n", + "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n", + "\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "100% 122M/122M [00:04<00:00, 29.7MB/s]\n", + "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. 
Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n", + "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n", + "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n", + "06/15 06:51:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:07:06 time: 0.4613 data_time: 0.0472 memory: 1381 grad_norm: 17.8613 loss: 1.1505 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 1.1505\n", + "06/15 06:51:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:28 time: 0.2655 data_time: 0.0204 memory: 1381 grad_norm: 6.8642 loss: 0.5417 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5417\n", + "06/15 06:51:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:05:06 time: 0.3121 data_time: 0.0505 memory: 1381 grad_norm: 5.3190 loss: 0.6625 recall@thr=0.5: 0.9000 prec@thr=0.5: 0.9000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.6625\n", + "06/15 06:51:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:44 time: 0.2771 data_time: 0.0255 memory: 1381 grad_norm: 3.0057 loss: 0.6646 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.6646\n", + "06/15 06:51:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:26 time: 0.2625 data_time: 0.0130 memory: 1381 grad_norm: 1.8442 
loss: 0.5711 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5711\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:18 time: 0.2930 data_time: 0.0322 memory: 1381 grad_norm: 2.5183 loss: 0.6887 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.6887\n", + "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "06/15 06:51:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:14 time: 0.1446 data_time: 0.0853 memory: 466 \n", + "06/15 06:52:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:10 time: 0.1124 data_time: 0.0612 memory: 466 \n", + "06/15 06:52:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:07 time: 0.1016 data_time: 0.0505 memory: 466 \n", + "06/15 06:52:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:04 time: 0.1083 data_time: 0.0581 memory: 466 \n", + "06/15 06:52:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1650 data_time: 0.1102 memory: 466 \n", + "06/15 06:52:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] eta: 0:00:00 time: 0.1410 data_time: 0.0866 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 
9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 
basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 47.41\n", + "aerobic split jump 30.01\n", + "aerobic scissors leap 88.94\n", + "aerobic turn 98.43\n", + "mAP 66.20\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 
34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 25.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 56.25\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", 
+ "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + 
"no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 100.00\n", + "mAP 36.25\n", + "06/15 06:52:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 66.1965 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 50.4167 mAP/v_map_0.10:0.90: 37.7963 mAP/v_map_0.50:0.95: 26.8167 data_time: 0.0753 time: 0.1288\n", + "06/15 06:52:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 66.1965 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n", + "06/15 06:52:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:11 time: 0.3098 data_time: 0.0484 memory: 1381 grad_norm: 1.1745 loss: 0.4384 recall@thr=0.5: 0.7857 prec@thr=0.5: 0.7857 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4384\n", + "06/15 06:52:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:06 time: 0.3245 data_time: 0.0667 memory: 1381 grad_norm: 1.0271 loss: 0.3960 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3960\n", + "06/15 06:52:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:55 time: 0.2572 data_time: 0.0111 memory: 1381 grad_norm: 0.8150 loss: 0.3958 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3958\n", + "06/15 06:52:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 
80/118] lr: 3.2500e-03 eta: 0:03:47 time: 0.2843 data_time: 0.0167 memory: 1381 grad_norm: 1.4691 loss: 0.4575 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4575\n", + "06/15 06:52:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:41 time: 0.3118 data_time: 0.0559 memory: 1381 grad_norm: 1.9420 loss: 0.5529 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5529\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:33 time: 0.2532 data_time: 0.0082 memory: 1381 grad_norm: 1.6790 loss: 0.4253 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4253\n", + "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "06/15 06:52:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:15 time: 0.1515 data_time: 0.0968 memory: 466 \n", + "06/15 06:53:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:12 time: 0.1679 data_time: 0.1143 memory: 466 \n", + "06/15 06:53:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:08 time: 0.1134 data_time: 0.0631 memory: 466 \n", + "06/15 06:53:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.0961 data_time: 0.0459 memory: 466 \n", + "06/15 06:53:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1063 data_time: 0.0549 memory: 466 \n", + "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) 
[2][120/120] eta: 0:00:00 time: 0.1017 data_time: 0.0522 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + 
"no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 42.09\n", + "aerobic split jump 27.71\n", + "aerobic scissors leap 90.02\n", + "aerobic turn 95.76\n", + "mAP 63.89\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 
volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no 
such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + 
"no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 63.8934 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.8889 mAP/v_map_0.10:0.90: 34.0278 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0710 time: 0.1226\n", + "06/15 06:53:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:34 time: 0.4330 data_time: 0.1493 memory: 1381 grad_norm: 0.4795 loss: 0.5049 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 0.5049\n", + "06/15 06:53:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:27 time: 0.2948 data_time: 0.0370 memory: 1381 grad_norm: 0.8584 loss: 0.4820 recall@thr=0.5: 0.6154 prec@thr=0.5: 0.6154 recall@top3: 0.6154 prec@top3: 0.2051 recall@top5: 0.6154 prec@top5: 0.1231 loss_action_cls: 0.4820\n", + "06/15 06:53:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:19 time: 0.2622 data_time: 0.0118 memory: 
1381 grad_norm: 1.1041 loss: 0.2944 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2944\n", + "06/15 06:53:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:13 time: 0.3111 data_time: 0.0470 memory: 1381 grad_norm: 0.8394 loss: 0.3393 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3393\n", + "06/15 06:53:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:06 time: 0.2989 data_time: 0.0417 memory: 1381 grad_norm: 0.2155 loss: 0.4345 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4345\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:59 time: 0.2576 data_time: 0.0112 memory: 1381 grad_norm: 0.2509 loss: 0.4634 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4634\n", + "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n", + "06/15 06:53:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:18 time: 0.1815 data_time: 0.1180 memory: 466 \n", + "06/15 06:53:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:13 time: 0.1451 data_time: 0.0905 memory: 466 \n", + "06/15 06:53:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:08 time: 0.1020 data_time: 0.0510 memory: 466 \n", + "06/15 06:53:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 
0.1008 data_time: 0.0528 memory: 466 \n", + "06/15 06:54:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1072 data_time: 0.0569 memory: 466 \n", + "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1018 data_time: 0.0536 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 
42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 37.09\n", + "aerobic split jump 27.98\n", + "aerobic scissors leap 89.41\n", + "aerobic turn 95.67\n", + "mAP 62.54\n", + "\u001b[2Klinking tubes... 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial 
duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 100.00\n", + "aerobic turn 100.00\n", + "mAP 55.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second 
pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 36.00\n", + "aerobic turn 100.00\n", + "mAP 34.00\n", + "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 62.5361 
mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.2222 mAP/v_map_0.10:0.90: 34.1389 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0704 time: 0.1229\n", + "06/15 06:54:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:55 time: 0.3717 data_time: 0.0993 memory: 1381 grad_norm: 0.2139 loss: 0.3119 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3119\n", + "06/15 06:54:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:48 time: 0.2730 data_time: 0.0230 memory: 1381 grad_norm: 0.6102 loss: 0.4782 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4782\n", + "06/15 06:54:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:41 time: 0.2895 data_time: 0.0311 memory: 1381 grad_norm: 0.4057 loss: 0.3422 recall@thr=0.5: 0.9474 prec@thr=0.5: 0.9474 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3422\n", + "06/15 06:54:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:36 time: 0.3170 data_time: 0.0490 memory: 1381 grad_norm: 0.3051 loss: 0.3628 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3628\n", + "06/15 06:54:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:29 time: 0.2633 data_time: 0.0131 memory: 1381 grad_norm: 0.1671 loss: 0.3691 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3691\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: 
slowonly_k400_multisports_20230615_065057\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:23 time: 0.2721 data_time: 0.0181 memory: 1381 grad_norm: 0.1954 loss: 0.3076 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3076\n", + "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n", + "06/15 06:54:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:14 time: 0.1431 data_time: 0.0854 memory: 466 \n", + "06/15 06:54:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1086 data_time: 0.0584 memory: 466 \n", + "06/15 06:54:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:07 time: 0.1056 data_time: 0.0552 memory: 466 \n", + "06/15 06:54:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:04 time: 0.0922 data_time: 0.0399 memory: 466 \n", + "06/15 06:54:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1166 data_time: 0.0671 memory: 466 \n", + "06/15 06:54:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1468 data_time: 0.0927 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis 
jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 
basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 25.62\n", + "aerobic split jump 28.75\n", + "aerobic scissors leap 89.02\n", + "aerobic turn 93.30\n", + "mAP 59.17\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no 
such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 20.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 100.00\n", + "mAP 50.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 
aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not 
evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 100.00\n", + "mAP 36.25\n", + "06/15 06:54:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 59.1749 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 46.9444 mAP/v_map_0.10:0.90: 28.9352 mAP/v_map_0.50:0.95: 14.6667 data_time: 0.0663 time: 0.1186\n", + "06/15 06:55:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:17 time: 0.3090 data_time: 0.0513 memory: 1381 grad_norm: 0.2988 loss: 0.3067 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3067\n", + "06/15 06:55:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:10 time: 0.2584 data_time: 0.0142 memory: 1381 grad_norm: 0.6702 loss: 0.3996 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3996\n", + "06/15 06:55:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:04 time: 0.3286 data_time: 0.0617 memory: 1381 grad_norm: 0.4347 loss: 0.4374 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4374\n", + "06/15 06:55:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:58 time: 0.2774 data_time: 0.0247 memory: 1381 grad_norm: 0.4373 loss: 0.3679 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 0.8750 prec@top5: 0.1750 loss_action_cls: 0.3679\n", + "06/15 06:55:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.2603 data_time: 
0.0108 memory: 1381 grad_norm: 0.2507 loss: 0.3226 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3226\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:46 time: 0.3256 data_time: 0.0497 memory: 1381 grad_norm: 0.0940 loss: 0.2914 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2914\n", + "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n", + "06/15 06:55:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:11 time: 0.1166 data_time: 0.0625 memory: 466 \n", + "06/15 06:55:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:09 time: 0.1119 data_time: 0.0618 memory: 466 \n", + "06/15 06:55:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:06 time: 0.1012 data_time: 0.0504 memory: 466 \n", + "06/15 06:55:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0537 memory: 466 \n", + "06/15 06:55:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1766 data_time: 0.1239 memory: 466 \n", + "06/15 06:55:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1421 data_time: 0.0884 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic 
horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball 
pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 17.82\n", + "aerobic split jump 20.05\n", + "aerobic scissors leap 89.00\n", + "aerobic turn 91.20\n", + "mAP 54.52\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 
football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 60.00\n", + "mAP 35.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 
10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 
basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 26.67\n", + "mAP 17.92\n", + "06/15 06:55:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 54.5189 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 17.9167 mAP/v_map_0.05:0.45: 31.2037 mAP/v_map_0.10:0.90: 19.0741 mAP/v_map_0.50:0.95: 9.5833 data_time: 0.0733 time: 0.1249\n", + "06/15 06:55:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:40 time: 0.2867 data_time: 0.0385 memory: 1381 grad_norm: 0.1572 loss: 0.3008 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3008\n", + "06/15 06:55:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:34 time: 0.2720 data_time: 0.0167 memory: 1381 grad_norm: 0.0803 loss: 0.2377 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2377\n", + "06/15 06:56:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:28 time: 0.3423 data_time: 0.0840 memory: 1381 grad_norm: 0.3120 loss: 0.2442 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2442\n", + "06/15 06:56:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:22 time: 0.2580 data_time: 0.0112 memory: 1381 grad_norm: 0.5726 loss: 0.3794 recall@thr=0.5: 1.0000 
prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3794\n", + "06/15 06:56:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:16 time: 0.2949 data_time: 0.0347 memory: 1381 grad_norm: 0.1732 loss: 0.3004 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3004\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.3258 data_time: 0.0625 memory: 1381 grad_norm: 0.3709 loss: 0.3439 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3439\n", + "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n", + "06/15 06:56:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:11 time: 0.1169 data_time: 0.0624 memory: 466 \n", + "06/15 06:56:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:09 time: 0.1131 data_time: 0.0631 memory: 466 \n", + "06/15 06:56:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:06 time: 0.1064 data_time: 0.0553 memory: 466 \n", + "06/15 06:56:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:04 time: 0.1401 data_time: 0.0862 memory: 466 \n", + "06/15 06:56:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1519 data_time: 0.0982 memory: 466 \n", + "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0986 data_time: 0.0486 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 
aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + 
"no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 19.05\n", + "aerobic split jump 22.20\n", + "aerobic scissors leap 85.83\n", + "aerobic turn 79.04\n", + "mAP 51.53\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 
volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 0.00\n", + "mAP 20.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + 
"no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free 
throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 0.00\n", + "mAP 11.25\n", + "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 51.5300 mAP/v_map@0.2: 20.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 18.0556 mAP/v_map_0.10:0.90: 11.8519 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0688 time: 0.1209\n", + "06/15 06:56:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.2819 data_time: 0.0331 memory: 1381 grad_norm: 0.2811 loss: 0.2776 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2776\n", + "06/15 06:56:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.3114 data_time: 0.0473 memory: 1381 grad_norm: 0.1573 loss: 0.2043 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2043\n", + "06/15 06:56:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2903 data_time: 0.0342 memory: 1381 grad_norm: 0.1343 loss: 0.3411 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 
1.0000 prec@top5: 0.2000 loss_action_cls: 0.3411\n", + "06/15 06:57:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.2623 data_time: 0.0128 memory: 1381 grad_norm: 0.1026 loss: 0.2895 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2895\n", + "06/15 06:57:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.3206 data_time: 0.0503 memory: 1381 grad_norm: 0.1911 loss: 0.3552 recall@thr=0.5: 0.7333 prec@thr=0.5: 0.7333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3552\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:35 time: 0.2884 data_time: 0.0335 memory: 1381 grad_norm: 0.1274 loss: 0.4391 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4391\n", + "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n", + "06/15 06:57:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:11 time: 0.1193 data_time: 0.0693 memory: 466 \n", + "06/15 06:57:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:09 time: 0.1188 data_time: 0.0670 memory: 466 \n", + "06/15 06:57:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:08 time: 0.1645 data_time: 0.1114 memory: 466 \n", + "06/15 06:57:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:05 time: 0.1391 data_time: 0.0850 memory: 466 \n", + "06/15 06:57:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][100/120] 
eta: 0:00:02 time: 0.1104 data_time: 0.0585 memory: 466 \n", + "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1025 data_time: 0.0512 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 
football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 20.79\n", + "aerobic split jump 20.11\n", + "aerobic scissors leap 84.84\n", + "aerobic turn 78.58\n", + "mAP 51.08\n", + "\u001b[2Klinking tubes... 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial 
duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 20.00\n", + "mAP 25.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second 
pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 0.00\n", + "mAP 11.25\n", + "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 51.0794 
mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 22.5000 mAP/v_map_0.10:0.90: 14.0741 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0735 time: 0.1255\n", + "06/15 06:57:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.2894 data_time: 0.0322 memory: 1381 grad_norm: 0.1227 loss: 0.3286 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3286\n", + "06/15 06:57:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.4105 data_time: 0.1257 memory: 1381 grad_norm: 0.1948 loss: 0.3202 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3202\n", + "06/15 06:57:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3095 data_time: 0.0537 memory: 1381 grad_norm: 0.7997 loss: 0.2428 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2428\n", + "06/15 06:57:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.2918 data_time: 0.0330 memory: 1381 grad_norm: 0.8157 loss: 0.3045 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3045\n", + "06/15 06:58:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.3443 data_time: 0.0786 memory: 1381 grad_norm: 0.0966 loss: 0.2605 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2605\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: 
slowonly_k400_multisports_20230615_065057\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2611 data_time: 0.0148 memory: 1381 grad_norm: 0.3034 loss: 0.2694 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2694\n", + "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n", + "06/15 06:58:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:14 time: 0.1433 data_time: 0.0869 memory: 466 \n", + "06/15 06:58:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:12 time: 0.1664 data_time: 0.1160 memory: 466 \n", + "06/15 06:58:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:08 time: 0.1269 data_time: 0.0772 memory: 466 \n", + "06/15 06:58:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:05 time: 0.0951 data_time: 0.0455 memory: 466 \n", + "06/15 06:58:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1144 data_time: 0.0630 memory: 466 \n", + "06/15 06:58:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1028 data_time: 0.0530 memory: 466 \n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluate aerobic kick jump\n", + "do not evaluate aerobic off axis 
jump\n", + "do not evaluate aerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluate aerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 
basketball pass-inbound\n", + "do not evaluate basketball save\n", + "do not evaluate basketball jump ball\n", + "frameAP_0.5\n", + "\n", + "aerobic straight jump 15.29\n", + "aerobic split jump 20.74\n", + "aerobic scissors leap 86.38\n", + "aerobic turn 80.98\n", + "mAP 50.85\n", + "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hno such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no 
such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not evaluatebasketball jump ball\n", + "VideoAP_0.2\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 80.00\n", + "aerobic turn 20.00\n", + "mAP 25.00\n", + "no such label 0 aerobic push up\n", + "no such label 1 aerobic explosive push up\n", + "no such label 2 aerobic explosive support\n", + "no such label 3 aerobic leg circle\n", + "no such label 4 aerobic helicopter\n", + "no such label 5 aerobic support\n", + "no such label 6 aerobic v support\n", + "no such label 7 aerobic horizontal support\n", + "no such label 9 aerobic illusion\n", + "no such label 10 aerobic bent leg(s) jump\n", + "no such label 11 aerobic pike jump\n", + "no such label 12 aerobic straddle jump\n", + "do not evaluateaerobic kick jump\n", + "do not evaluateaerobic off axis jump\n", + "do not evaluateaerobic butterfly jump\n", + "no such label 18 
aerobic split\n", + "do not evaluateaerobic balance turn\n", + "no such label 21 volleyball serve\n", + "no such label 22 volleyball block\n", + "no such label 23 volleyball first pass\n", + "no such label 24 volleyball defend\n", + "no such label 25 volleyball protect\n", + "no such label 26 volleyball second pass\n", + "no such label 27 volleyball adjust\n", + "no such label 28 volleyball save\n", + "no such label 29 volleyball second attack\n", + "no such label 30 volleyball spike\n", + "no such label 31 volleyball dink\n", + "no such label 32 volleyball no offensive attack\n", + "no such label 33 football shoot\n", + "no such label 34 football long pass\n", + "no such label 35 football short pass\n", + "no such label 36 football through pass\n", + "no such label 37 football cross\n", + "no such label 38 football dribble\n", + "no such label 39 football trap\n", + "no such label 40 football throw\n", + "no such label 41 football diving\n", + "no such label 42 football tackle\n", + "no such label 43 football steal\n", + "no such label 44 football clearance\n", + "no such label 45 football block\n", + "no such label 46 football press\n", + "no such label 47 football aerial duels\n", + "no such label 48 basketball pass\n", + "no such label 49 basketball drive\n", + "no such label 50 basketball dribble\n", + "no such label 51 basketball 3-point shot\n", + "no such label 52 basketball 2-point shot\n", + "no such label 53 basketball free throw\n", + "no such label 54 basketball block\n", + "no such label 55 basketball offensive rebound\n", + "no such label 56 basketball defensive rebound\n", + "no such label 57 basketball pass steal\n", + "no such label 58 basketball dribble steal\n", + "no such label 59 basketball interfere shot\n", + "no such label 60 basketball pick-and-roll defensive\n", + "no such label 61 basketball sag\n", + "no such label 62 basketball screen\n", + "no such label 63 basketball pass-inbound\n", + "do not evaluatebasketball save\n", + "do not 
evaluatebasketball jump ball\n", + "VideoAP_0.5\n", + "\n", + "aerobic straight jump 0.00\n", + "aerobic split jump 0.00\n", + "aerobic scissors leap 45.00\n", + "aerobic turn 20.00\n", + "mAP 16.25\n", + "06/15 06:58:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 50.8487 mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 16.2500 mAP/v_map_0.05:0.45: 23.0556 mAP/v_map_0.10:0.90: 15.1852 mAP/v_map_0.50:0.95: 8.4167 data_time: 0.0732 time: 0.1244\n", + "\u001b[32mTraining finished successfully. \u001b[0m\n" + ] + } + ], + "source": [ + "# 使用 MIM 训练模型\n", + "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n", + " --work-dir work_dirs/stad_model/" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "HCg6C9HYxRAt" + }, + "source": [ + "## 4. 时空行为检测模型推理\n", + "\n", + "训练得到检测模型和时空行为检测模型后,我们可以利用时空行为检测 demo 进行推理,可视化模型效果。\n", + "\n", + "由于 tutorial 中使用的训练数据较少,模型性能较差,所以可视化时使用预先训练好的模型。" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "WW5-IJ7IxRAu" + }, + "source": [ + "###" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FofW_5RoxRAu", + "outputId": "91217660-946d-48ab-f663-b0f7f2d6a6f6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) 
Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n", + "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n", + "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n", + "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n", + "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n", + "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n", + "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n", + "Performing Human Detection for each frame\n", + "[>>] 99/99, 6.8 task/s, elapsed: 15s, ETA: 0s\n", + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n", + "Performing SpatioTemporal Action Detection for each clip\n", + "[>>] 99/99, 16.6 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n", + "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n", + "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n", + "\n", + "Moviepy - Done !\n", + "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n" + ] + } + ], + "source": [ + "!python ../../demo/demo_spatiotemporal_det.py \\\n", + " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n", + " data/demo_spatiotemporal_det.mp4 \\\n", + " --config configs/slowonly_k400_multisports.py \\\n", + " --checkpoint 
https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n", + " --det-config configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n", + " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n", + " --det-score-thr 0.85 \\\n", + " --action-score-thr 0.8 \\\n", + " --label-map ../../tools/data/multisports/label_map.txt \\\n", + " --predict-stepsize 8 \\\n", + " --output-stepsize 1 \\\n", + " --output-fps 24" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 741 + }, + "id": "677FUWFRxRAv", + "outputId": "f702d544-3492-494c-af81-9e90f43d6b6c" + }, + "outputs": [], + "source": [ + "# Show Video\n", + "import moviepy.editor\n", + "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/projects/stad_tutorial/tools/convert_proposals.py b/projects/stad_tutorial/tools/convert_proposals.py new file mode 100644 index 0000000000000000000000000000000000000000..1ea7af4b26cd64381c993c5e3d2942f4ccf542a0 --- /dev/null +++ b/projects/stad_tutorial/tools/convert_proposals.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import argparse
import os.path as osp

import numpy as np
from mmengine import dump, load, track_iter_progress


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Split MMDetection test results into train/val proposal '
        'files for spatio-temporal action detection')
    parser.add_argument(
        '--det_test_result',
        default='data/multisports/annotations/ms_det_proposals.pkl',
        help='human detection test result file')
    parser.add_argument(
        '--stad_gt',
        help='spatio-temporal action detection ground truth file',
        default='data/multisports/annotations/multisports_GT.pkl')
    parser.add_argument(
        '--out_result',
        default='data/multisports/annotations/multisports_proposals.pkl',
        help='output file template; `_train`/`_val` is inserted before the '
        'extension')
    args = parser.parse_args()
    return args


def dump_det_result(args):
    """Convert detection results into per-frame proposal dicts per split.

    Each detection sample contributes one entry keyed by
    ``'<sport>/<clip>.mp4,<frame:04d>'`` whose value is an (n, 5) array of
    ``[x1, y1, x2, y2, score]`` with coordinates normalized to [0, 1].
    Entries are routed to the train or val output according to the split
    lists stored in the ground-truth file.
    """
    print('loading test result...')
    det_result = load(args.det_test_result)
    stad_gt = load(args.stad_gt)
    # The GT file wraps each split list in an outer list, hence the [0].
    train_list = stad_gt['train_videos'][0]
    val_list = stad_gt['test_videos'][0]
    train_bbox_result = {}
    val_bbox_result = {}
    for sample in track_iter_progress(det_result):
        bboxes = sample['pred_instances']['bboxes']
        scores = sample['pred_instances']['scores']
        h, w = sample['ori_shape']
        # Normalize in place: x coordinates by width, y by height.
        bboxes[:, ::2] /= w
        bboxes[:, 1::2] /= h
        img_path = sample['img_path']
        # img_path looks like '.../<sport>/<clip>/<frame>.jpg'.
        frm_key_list = img_path.split('.jpg')[0].split('/')
        frm_key = ','.join([
            f'{frm_key_list[-3]}/{frm_key_list[-2]}.mp4',
            f'{int(frm_key_list[-1]):04d}'
        ])
        bbox = np.concatenate([bboxes, scores[:, None]], axis=1)

        vid_key = '/'.join(frm_key_list[-3:-1])
        if vid_key in train_list:
            train_bbox_result[frm_key] = bbox
        elif vid_key in val_list:
            val_bbox_result[frm_key] = bbox
        else:
            # A video belonging to neither split means the detection results
            # and the ground truth file are inconsistent -- fail loudly.
            raise KeyError(vid_key)
    # Derive output names with splitext instead of assuming a 4-char '.pkl'
    # suffix, so non-'.pkl' output paths are handled correctly too.
    stem, ext = osp.splitext(args.out_result)
    dump(train_bbox_result, f'{stem}_train{ext}')
    dump(val_bbox_result, f'{stem}_val{ext}')


if __name__ == '__main__':
    args = parse_args()
    dump_det_result(args)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
from collections import defaultdict

from mmengine import dump, load


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Convert spatio-temporal action detection ground truth '
        'into COCO-style person-detection annotations')
    parser.add_argument(
        'stad_anno', help='spatiotemporal action detection anno path')
    parser.add_argument('det_path', help='output detection anno path')
    args = parser.parse_args()
    return args


def generate_mmdet_coco_anno(args):
    """Generate COCO detection annotations from action tube ground truth.

    For each split ('train'/'val') every annotated frame becomes one COCO
    image entry and every tube box in that frame becomes one annotation of
    category 0 ('person'); the action class of the tube is deliberately
    discarded so the output can be used for a single-class human detector.
    Results are written to ``<det_path stem>_train<ext>`` and
    ``<det_path stem>_val<ext>``.
    """
    ori_anno = load(args.stad_anno)
    train_videos = ori_anno['train_videos']
    val_videos = ori_anno['test_videos']
    videos = {'train': train_videos, 'val': val_videos}
    for split in ['train', 'val']:
        img_id = 0
        bbox_id = 0
        img_list = []
        anno_list = []
        # The split list is wrapped in an outer list, hence the [0].
        for vid in videos[split][0]:
            vid_tubes = ori_anno['gttubes'][vid]
            height, width = ori_anno['resolution'][vid]
            # Gather every tube box of this video grouped by frame index.
            # Only the values are needed: the action-class key is unused
            # because all boxes are relabeled as 'person' (label 0).
            frm2bbox = defaultdict(list)
            for tube_list in vid_tubes.values():
                for tube in tube_list:
                    for frm_anno in tube:
                        frm_idx, bbox = frm_anno[0], frm_anno[1:]
                        frm2bbox[frm_idx].append({'label': 0, 'bbox': bbox})
            for frm_idx, frm_bboxes in frm2bbox.items():
                # Frame files are named with 5-digit zero padding.
                img_path = f'{vid}/{int(frm_idx):05d}.jpg'
                img_instance = {
                    'file_name': img_path,
                    'height': height,
                    'width': width,
                    'id': img_id
                }
                img_list.append(img_instance)

                for bbox_info in frm_bboxes:
                    label = bbox_info['label']
                    x1, y1, x2, y2 = bbox_info['bbox']
                    # COCO uses [x, y, w, h] boxes.
                    bbox = [x1, y1, x2 - x1, y2 - y1]
                    anno_instance = {
                        'area': bbox[2] * bbox[3],
                        'image_id': img_id,
                        'bbox': bbox,
                        'category_id': label,
                        'iscrowd': 0,
                        'id': bbox_id
                    }
                    anno_list.append(anno_instance)
                    bbox_id += 1
                img_id += 1
        total_anno = {
            'images': img_list,
            'annotations': anno_list,
            'categories': [{
                'id': 0,
                'name': 'person'
            }],
        }
        # Build the per-split name with splitext instead of assuming a
        # 5-character '.json' suffix.
        stem, ext = osp.splitext(args.det_path)
        dump(total_anno, f'{stem}_{split}{ext}')


if __name__ == '__main__':
    args = parse_args()
    generate_mmdet_coco_anno(args)
# Copyright (c) OpenMMLab. All rights reserved.
"""Extract per-frame JPEG images from MultiSports trainval videos."""
import os
import os.path as osp

import cv2

src_dir = 'data/multisports/trainval'
target_dir = 'data/multisports/rawframes'

sport_list = ['aerobic_gymnastics']
for sport in sport_list:
    video_root = osp.join(src_dir, sport)
    if not osp.exists(video_root):
        print(f'No {video_root} video dir to generate rgb images.')
        continue
    print(f'Will generate {len(os.listdir(video_root))} rgb dir for '
          f'{osp.basename(sport)}.')
    for clip_name in os.listdir(video_root):
        mp4_path = osp.join(video_root, clip_name)
        # Strip the '.mp4' suffix to get the rawframe directory name.
        save_dir = osp.join(target_dir, sport, clip_name[:-4])
        os.makedirs(save_dir, exist_ok=True)
        cap = cv2.VideoCapture(mp4_path)
        # Frames are numbered from 1 with 5-digit zero padding, matching the
        # filename convention used by the annotation converters.
        frame_idx = 1
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or decode failure): stop for this clip.
                break
            image_name = osp.join(save_dir, f'{frame_idx:05d}.jpg')
            cv2.imwrite(image_name, frame)
            frame_idx += 1
        cap.release()
        print(f'Generate {clip_name[:-4]} rgb dir successfully.')
import argparse
import os


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Convert images to coco format without annotations')
    parser.add_argument('img_path', help='The root path of images')
    parser.add_argument(
        'classes', type=str, help='The text file name of storage class list')
    parser.add_argument(
        'out',
        type=str,
        help='The output annotation json file name, The save dir is in the '
        'same directory as img_path')
    parser.add_argument(
        '-e',
        '--exclude-extensions',
        type=str,
        nargs='+',
        help='The suffix of images to be excluded, such as "png" and "bmp"')
    args = parser.parse_args()
    return args


def get_img_info(args):
    """Return a ``{filename, width, height}`` dict for one image.

    ``args`` is a ``(root_path, image_path, exclude_extensions)`` tuple so
    this function can be mapped over by ``track_parallel_progress``.
    Returns ``None`` when the image is excluded by suffix.
    """
    # Imported lazily so importing this module does not require PIL.
    from PIL import Image

    path, image_path, exclude_extensions = args
    # str.endswith needs a str or a *tuple* of strings; argparse delivers a
    # list for nargs='+', which would raise TypeError -- convert explicitly.
    if exclude_extensions is not None and image_path.lower().endswith(
            tuple(exclude_extensions)):
        return None
    img_pillow = Image.open(os.path.join(path, image_path))
    img_info = {
        'filename': image_path,
        'width': img_pillow.width,
        'height': img_pillow.height,
    }
    return img_info


def collect_image_infos(path, exclude_extensions=None):
    """Collect size info for every image found under ``path`` recursively.

    Images rejected by the exclude filter produce ``None`` from
    ``get_img_info`` and are filtered out here, so downstream conversion
    never sees ``None`` entries.
    """
    # Imported lazily so importing this module does not require mmengine.
    from mmengine.utils import scandir, track_parallel_progress

    images_generator = scandir(path, recursive=True)

    img_infos = track_parallel_progress(
        get_img_info, [(path, image_path, exclude_extensions)
                       for image_path in images_generator],
        nproc=64)

    # Drop entries that were skipped by the exclude filter.
    return [info for info in img_infos if info is not None]


def cvt_to_coco_json(img_infos, classes):
    """Build a COCO-style dict (with empty annotations) from image infos.

    Args:
        img_infos (list[dict]): dicts with 'filename', 'width', 'height'.
        classes (list[str]): category names; index becomes the category id.
    """
    image_id = 0
    coco = dict()
    coco['images'] = []
    coco['type'] = 'instance'
    coco['categories'] = []
    coco['annotations'] = []
    image_set = set()

    for category_id, name in enumerate(classes):
        category_item = dict()
        category_item['supercategory'] = str('none')
        category_item['id'] = int(category_id)
        category_item['name'] = str(name)
        coco['categories'].append(category_item)

    for img_dict in img_infos:
        file_name = img_dict['filename']
        # Duplicate file names would collide in COCO tooling -- reject early.
        assert file_name not in image_set
        image_item = dict()
        image_item['id'] = int(image_id)
        image_item['file_name'] = str(file_name)
        image_item['height'] = int(img_dict['height'])
        image_item['width'] = int(img_dict['width'])
        coco['images'].append(image_item)
        image_set.add(file_name)

        image_id += 1
    return coco


def main():
    # Imported lazily so importing this module does not require mmengine.
    from mmengine.fileio import dump, list_from_file
    from mmengine.utils import mkdir_or_exist

    args = parse_args()
    assert args.out.endswith(
        'json'), 'The output file name must be json suffix'

    # 1 load image list info
    img_infos = collect_image_infos(args.img_path, args.exclude_extensions)

    # 2 convert to coco format data
    classes = list_from_file(args.classes)
    coco_info = cvt_to_coco_json(img_infos, classes)

    # 3 dump next to img_path under an 'annotations' sibling directory
    save_dir = os.path.join(args.img_path, '..', 'annotations')
    mkdir_or_exist(save_dir)
    save_path = os.path.join(save_dir, args.out)
    dump(coco_info, save_path)
    print(f'save json file: {save_path}')


if __name__ == '__main__':
    main()
To increase data efficiency, we mask out most of the low-semantics video tokens, but selectively align the unmasked tokens with IFM, which serves as the UnMasked Teacher (UMT). By providing semantic guidance, our method enables faster convergence and multimodal friendliness. With a progressive pre-training framework, our model can handle various tasks including scene-related, temporal-related, and complex video-language understanding. Using only public sources for pre-training in 6 days on 32 A100 GPUs, our scratch-built ViT-L/16 achieves state-of-the-art performances on various video tasks. + + + +
+ +
+ +## Usage + +### Setup Environment + +Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. + +Assume that you are located at `$MMACTION2/projects/umt`. + +Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/kinetics#readme). + +Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link. + +```shell +ln -s ../../data ./data +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### Kinetics400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | testing protocol | config | ckpt | +| :---------------------: | :--------: | :------: | :---------: | :------: | :--------------: | :-------------------------------------------------------------: | :-----------------------------------------------------------: | +| uniform 8 | 224x224 | UMT-B | Kinetics710 | 87.33 | 4 clips x 3 crop | 
[config](./configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.pth) | +| uniform 8 | 224x224 | UMT-L | Kinetics710 | 90.21 | 4 clips x 3 crop | [config](./configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb/umt-large-p16-res224_kinetics710-pre-ft_u8_k400-rgb.pth) | + +### Kinetics700 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | testing protocol | config | ckpt | +| :---------------------: | :--------: | :------: | :---------: | :------: | :--------------: | :-------------------------------------------------------------: | :-----------------------------------------------------------: | +| uniform 8 | 224x224 | UMT-B | Kinetics710 | 77.95 | 4 clips x 3 crop | [config](./configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb/umt-base-p16-res224_kinetics710-pre-ft_u8_k700-rgb.pth) | +| uniform 8 | 224x224 | UMT-L | Kinetics710 | 82.79 | 4 clips x 3 crop | [config](./configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/umt/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.pth) | + +## Citation + + + +```bibtex +@article{li2023unmasked, + title={Unmasked teacher: Towards training-efficient video foundation models}, + author={Li, Kunchang and Wang, Yali and Li, Yizhuo and Wang, Yi and He, Yinan and Wang, Limin and Qiao, Yu}, + journal={arXiv preprint arXiv:2303.16058}, + year={2023} +} +``` diff --git a/projects/umt/configs/umt-base-p16-res224_kinetics710-pre-ft_u8_k400-rgb.py 
# Inference-only MMAction2 config: UMT ViT-Base/16 backbone (embed dim 768,
# depth 12), 8 uniformly sampled frames, evaluated on Kinetics-400.
custom_imports = dict(imports='models')  # registers the project-local UMTViT

# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='UMTViT',
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        all_frames=8,
        qkv_bias=True),
    cls_head=dict(
        type='TimeSformerHead',
        num_classes=400,
        in_channels=768,  # must match the backbone embed_dim
        average_clips='prob'),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        format_shape='NCTHW'))

# dataset settings
dataset_type = 'VideoDataset'
data_root_val = 'data/kinetics400/videos_val'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'

file_client_args = dict(io_backend='disk')

# 4 uniform clips x ThreeCrop = the '4 clips x 3 crop' testing protocol.
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='ThreeCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

test_dataloader = dict(
    batch_size=8,
    num_workers=16,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

test_evaluator = dict(type='AccMetric')
test_cfg = dict(type='TestLoop')

default_scope = 'mmaction'

default_hooks = dict(
    runtime_info=dict(type='RuntimeInfoHook'),
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=20, ignore_last=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    sync_buffers=dict(type='SyncBuffersHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))

log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends)

log_level = 'INFO'
load_from = None
resume = False
# Inference-only MMAction2 config: UMT ViT-Base/16 backbone (embed dim 768,
# depth 12), 8 uniformly sampled frames, evaluated on Kinetics-700.
custom_imports = dict(imports='models')  # registers the project-local UMTViT

# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='UMTViT',
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        all_frames=8,
        qkv_bias=True),
    cls_head=dict(
        type='TimeSformerHead',
        num_classes=700,
        in_channels=768,  # must match the backbone embed_dim
        average_clips='prob'),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        format_shape='NCTHW'))

# dataset settings
dataset_type = 'VideoDataset'
data_root_val = 'data/kinetics700/videos_val'
ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'

file_client_args = dict(io_backend='disk')

# 4 uniform clips x ThreeCrop = the '4 clips x 3 crop' testing protocol.
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='ThreeCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

test_dataloader = dict(
    batch_size=8,
    num_workers=16,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

test_evaluator = dict(type='AccMetric')
test_cfg = dict(type='TestLoop')

default_scope = 'mmaction'

default_hooks = dict(
    runtime_info=dict(type='RuntimeInfoHook'),
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=20, ignore_last=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    sync_buffers=dict(type='SyncBuffersHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))

log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends)

log_level = 'INFO'
load_from = None
resume = False
# Inference-only MMAction2 config: UMT ViT-Large/16 backbone (embed dim 1024,
# depth 24), 8 uniformly sampled frames, evaluated on Kinetics-400.
custom_imports = dict(imports='models')  # registers the project-local UMTViT

# model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='UMTViT',
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        all_frames=8,
        qkv_bias=True),
    cls_head=dict(
        type='TimeSformerHead',
        num_classes=400,
        in_channels=1024,  # must match the backbone embed_dim
        average_clips='prob'),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        format_shape='NCTHW'))

# dataset settings
dataset_type = 'VideoDataset'
data_root_val = 'data/kinetics400/videos_val'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'

file_client_args = dict(io_backend='disk')

# 4 uniform clips x ThreeCrop = the '4 clips x 3 crop' testing protocol.
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='ThreeCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

test_dataloader = dict(
    batch_size=8,
    num_workers=16,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

test_evaluator = dict(type='AccMetric')
test_cfg = dict(type='TestLoop')

default_scope = 'mmaction'

default_hooks = dict(
    runtime_info=dict(type='RuntimeInfoHook'),
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=20, ignore_last=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    sync_buffers=dict(type='SyncBuffersHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))

log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends)

log_level = 'INFO'
load_from = None
resume = False
b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py new file mode 100644 index 0000000000000000000000000000000000000000..33c7793f31faed2420ec02cd08fce10ba1aeddd6 --- /dev/null +++ b/projects/umt/configs/umt-large-p16-res224_kinetics710-pre-ft_u8_k700-rgb.py @@ -0,0 +1,82 @@ +custom_imports = dict(imports='models') + +# model settings +model = dict( + type='Recognizer3D', + backbone=dict( + type='UMTViT', + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + all_frames=8, + qkv_bias=True), + cls_head=dict( + type='TimeSformerHead', + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics700/videos_val' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=8, num_clips=4, test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=8, + num_workers=16, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='AccMetric') +test_cfg = dict(type='TestLoop') + +default_scope = 'mmaction' + +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + 
type='CheckpointHook', interval=1, save_best='auto', max_keep_ckpts=5), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict(type='ActionVisualizer', vis_backends=vis_backends) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/projects/umt/models/__init__.py b/projects/umt/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d822e47d27a4077e10e1d95fb4b6e8b77f0f355 --- /dev/null +++ b/projects/umt/models/__init__.py @@ -0,0 +1,3 @@ +from .vit import UMTViT + +__all__ = ['UMTViT'] diff --git a/projects/umt/models/vit.py b/projects/umt/models/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..b02e24f970bd25f0b8cedeecdc90caa619bea4ae --- /dev/null +++ b/projects/umt/models/vit.py @@ -0,0 +1,344 @@ +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from mmcv.cnn.bricks import DropPath +from mmengine import to_2tuple + +from mmaction.registry import MODELS + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the original BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class 
Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + attn_head_dim=attn_head_dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if init_values > 0: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + num_frames=16, + tubelet_size=2): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.tubelet_size = int(tubelet_size) + num_patches = (img_size[1] // + patch_size[1]) * (img_size[0] // patch_size[0]) * ( + num_frames // self.tubelet_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.proj = nn.Conv3d( + in_channels=in_chans, + out_channels=embed_dim, + kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]), + stride=(self.tubelet_size, patch_size[0], patch_size[1])) + + def forward(self, x): + B, C, T, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model " \ + f'({self.img_size[0]}*{self.img_size[1]}).' 
        # (B, C, T, H, W) -> (B, num_patches, embed_dim): conv, flatten the
        # spatio-temporal grid, and move channels last as token features.
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x


# sin-cos position encoding
def get_sinusoid_encoding_table(n_position,
                                d_hid,
                                cur_frame=-1,
                                pre_n_position=1568):
    """Sinusoid position encoding table.

    Builds the fixed sine/cosine table for ``pre_n_position`` tokens (the
    pretraining layout; 1568 = 8 frames x 14 x 14 patches) and, when the
    requested geometry differs, interpolates it spatially (bicubic) and/or
    temporally (linear) to ``n_position`` tokens.

    Returns a plain tensor when ``n_position == pre_n_position``;
    otherwise an ``nn.Parameter`` initialised from the (interpolated)
    table and marked learnable.
    """

    def get_position_angle_vec(position):
        # Angle for each hidden dim: pos / 10000^(2*(j//2)/d_hid).
        return [
            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
            for hid_j in range(d_hid)
        ]

    sinusoid_table = np.array(
        [get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    sinusoid_table = torch.tensor(
        sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
    print(f'n_position: {n_position}')
    print(f'pre_n_position: {pre_n_position}')
    # Spatial interpolation: the checkpoint grid is 14x14 per frame.
    # NOTE(review): assumes n_position is divisible into cur_frame square
    # frames — TODO confirm against callers.
    if n_position // cur_frame * 8 != pre_n_position and cur_frame != -1:
        T = 8  # checkpoint frame
        P = 14  # checkpoint size
        C = d_hid
        new_P = int((n_position // cur_frame)**0.5)  # testing size
        print(
            f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
        print('Interpolate the position embedding')
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.reshape(-1, P, P,
                                                C).permute(0, 3, 1, 2)
        sinusoid_table = torch.nn.functional.interpolate(
            sinusoid_table,
            size=(new_P, new_P),
            mode='bicubic',
            align_corners=False)
        # BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(
            -1, T, new_P, new_P, C)
        sinusoid_table = sinusoid_table.flatten(1, 3)
    # Temporal interpolation: the checkpoint was trained with 8 frames.
    if cur_frame != -1 and cur_frame != 8:
        print(f'Pretraining uses 8 frames, but current frame is {cur_frame}')
        print('Interpolate the position embedding')
        T = 8  # checkpoint frame
        new_T = cur_frame  # testing frame
        # interpolate
        P = int((n_position // cur_frame)**0.5)  # testing size
        C = d_hid
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 4,
                                                1).reshape(-1, C,
                                                           T)  # BHW, C, T
        sinusoid_table = torch.nn.functional.interpolate(
            sinusoid_table, size=new_T, mode='linear')
        sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(
            0, 4, 1, 2, 3)  # B, T, H, W, C
        sinusoid_table = sinusoid_table.flatten(1, 3)
    if n_position == pre_n_position:
        return sinusoid_table
    else:
        print('Use learnable position embedding')
        return nn.Parameter(sinusoid_table, requires_grad=True)


@MODELS.register_module()
class UMTViT(nn.Module):
    """UMT Vision Transformer backbone for video recognition.

    Patch-embeds a video clip, adds (fixed sin-cos or learnable)
    positional embeddings, runs ``depth`` transformer blocks, and returns
    either the mean-pooled token features (``use_mean_pooling=True``) or
    the first token.
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 all_frames=16,
                 tubelet_size=1,
                 use_checkpoint=False,
                 checkpoint_num=0,
                 use_mean_pooling=True):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim
        self.tubelet_size = tubelet_size
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            num_frames=all_frames,
            tubelet_size=self.tubelet_size)
        num_patches = self.patch_embed.num_patches
        # Gradient checkpointing is applied to the first `checkpoint_num`
        # blocks only (see forward_features).
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        print(f'Use checkpoint: {use_checkpoint}')
        print(f'Checkpoint number: {checkpoint_num}')

        if use_learnable_pos_emb:
            self.pos_embed = nn.Parameter(
                torch.zeros(1, num_patches, embed_dim))
        else:
            # sine-cosine positional embeddings is on the way
            # pre_n_position matches the pretraining token count for the
            # given patch size (2048 for p14 checkpoints, else 1568).
            if patch_size == 14:
                pre_n_position = 2048
            else:
                pre_n_position = 1568
            self.pos_embed = get_sinusoid_encoding_table(
                num_patches,
                embed_dim,
                all_frames // tubelet_size,
                pre_n_position=pre_n_position)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # Linearly increasing stochastic-depth rate across blocks.
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                init_values=init_values) for i in range(depth)
        ])
        # With mean pooling the final LayerNorm is applied after pooling
        # (fc_norm); otherwise it is applied to the token sequence (norm).
        self.norm = nn.Identity() if use_mean_pooling else norm_layer(
            embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None

    def forward_features(self, x):
        # Tokenise the clip, add the positional table (kept frozen here via
        # detach), then run the transformer blocks.
        x = self.patch_embed(x)
        B, _, _ = x.size()

        if self.pos_embed is not None:
            x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to(
                x.device).clone().detach()
        x = self.pos_drop(x)

        # Checkpoint only the first `checkpoint_num` blocks to trade
        # recompute for activation memory.
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and idx < self.checkpoint_num:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        x = self.norm(x)
        if self.fc_norm is not None:
            # Mean-pool tokens, then normalise.
            return self.fc_norm(x.mean(1))
        else:
            # No mean pooling: return the first token.
            return x[:, 0]

    def forward(self, x):
        # Backbone output; classification is handled by the cls_head.
        x = self.forward_features(x)
        return x
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9f24b7e3d2d77cbe3ebeeec91ec6c84ada508ee5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "genvidbench"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "ruff>=0.13.2",
+]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..75ee2915d5bf69cb064b34aa22fa73d56e6b24ab
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,33 @@
+# Core dependencies for Hugging Face Spaces
+torch>=1.13.0
+torchvision>=0.14.0
+torchaudio>=0.13.0
+
+# MMAction2 dependencies
+mmcv>=2.0.0,<2.2.0
+mmengine>=0.7.1
+mmdet>=3.0.0
+
+# Video processing
+opencv-python>=4.6.0
+decord>=0.6.0
+av>=9.0.0
+moviepy>=1.0.3
+
+# Core ML libraries
+numpy>=1.21.0
+scipy>=1.9.0
+Pillow>=9.0.0
+matplotlib>=3.5.0
+
+# Gradio for web interface
+gradio>=4.0.0
+
+# Additional dependencies
+einops>=0.6.0
+timm>=0.9.0
+transformers>=4.28.0 + +# Optional but recommended +librosa>=0.9.0 +soundfile>=0.12.0 \ No newline at end of file diff --git a/requirements/build.txt b/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c94c9e11d4294978873d5a28252fa577df76dba --- /dev/null +++ b/requirements/build.txt @@ -0,0 +1,8 @@ +decord >= 0.4.1 +einops +matplotlib +numpy +opencv-contrib-python +Pillow +scipy +torch>=1.3 diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a24c45daba67d1fbff9bd192ccabf6669d10dc8 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,14 @@ +docutils==0.18.1 +einops +modelindex +myst-parser +opencv-python +-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +scipy +sphinx==6.1.3 +sphinx-notfound-page +sphinx-tabs +sphinx_copybutton +sphinx_markdown_tables +sphinxcontrib-jquery +tabulate diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ea0d12422b7cfc77acd3ad715b8fe434e17e014 --- /dev/null +++ b/requirements/mminstall.txt @@ -0,0 +1,2 @@ +mmcv>=2.0.0rc4,<2.2.0 +mmengine>=0.7.1,<1.0.0 diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt new file mode 100644 index 0000000000000000000000000000000000000000..c236cced2b3f48a7b8d5522c18d34294cddb0ba6 --- /dev/null +++ b/requirements/multimodal.txt @@ -0,0 +1 @@ +transformers>=4.28.0 diff --git a/requirements/optional.txt b/requirements/optional.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a88a10cfe07a3186a8a5ce7320599fbcd806722 --- /dev/null +++ b/requirements/optional.txt @@ -0,0 +1,13 @@ +av>=9.0 +future +imgaug +librosa +lmdb +moviepy +openai-clip +packaging +pims +PyTurboJPEG +soundfile +tensorboard +wandb diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt new file mode 100644 index 
0000000000000000000000000000000000000000..448ab5abed9c1a13e4b2a423ee710539e1a4eec6 --- /dev/null +++ b/requirements/readthedocs.txt @@ -0,0 +1,4 @@ +mmcv +titlecase +torch +torchvision diff --git a/requirements/tests.txt b/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ca5d863987fcffdea81833565116d8e8703cd4b --- /dev/null +++ b/requirements/tests.txt @@ -0,0 +1,9 @@ +coverage +flake8 +interrogate +isort==4.3.21 +parameterized +pytest +pytest-runner +xdoctest >= 0.10.0 +yapf diff --git a/resources/acc_curve.png b/resources/acc_curve.png new file mode 100644 index 0000000000000000000000000000000000000000..27a2f0851e7d9ee0c912f73af947b11453422988 Binary files /dev/null and b/resources/acc_curve.png differ diff --git a/resources/data_pipeline.png b/resources/data_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..1c73cd638649b27c071dc828b1341fd151de293c --- /dev/null +++ b/resources/data_pipeline.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21610750aefe62edee36272eb8321f7fcbca95c717c7b5dfb86b846428c78a54 +size 117332 diff --git a/resources/miaomiao_qrcode.jpg b/resources/miaomiao_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f5348e21ea6b398cd5a1cd621ce58f4c9a08e300 --- /dev/null +++ b/resources/miaomiao_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f9ac900623cc0e3e68c5fee382f78901800b5c9d84493afc03418e94dce018 +size 225737 diff --git a/resources/mmaction2_logo.png b/resources/mmaction2_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c759bb78c5424b4394d18a5ba833a8c9f43add Binary files /dev/null and b/resources/mmaction2_logo.png differ diff --git a/resources/mmaction2_overview.gif b/resources/mmaction2_overview.gif new file mode 100644 index 0000000000000000000000000000000000000000..123a65cff421325c740c0ef089b7bb659e4a555a --- /dev/null +++ 
b/resources/mmaction2_overview.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6b778f476ee46d3e136ad0bd596ae6b2c76be37a6edb6214f06f1edca02884 +size 1701421 diff --git a/resources/qq_group_qrcode.jpg b/resources/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cfd399858cac8bd164cf172140a76d8c8a7b8bf2 --- /dev/null +++ b/resources/qq_group_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7afbe414bbdfb299d0efec06baf4f21d9121897f338f8d6684592e215e9e7317 +size 204806 diff --git a/resources/spatio-temporal-det.gif b/resources/spatio-temporal-det.gif new file mode 100644 index 0000000000000000000000000000000000000000..ce134cdb949e3f0fd9b41343d3003770cc82948e --- /dev/null +++ b/resources/spatio-temporal-det.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77a76b06d3b72373da76aaa53ff8b87a77e9a021390137d3116d44b3a1bf637 +size 1302833 diff --git a/resources/zhihu_qrcode.jpg b/resources/zhihu_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f791e858c942e8d4da3098e8d18a687b7eca6f73 --- /dev/null +++ b/resources/zhihu_qrcode.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171db0200db2735325ab96a5aa6955343852c12af90dc79c9ae36f73694611c7 +size 397245 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..f22ad55baf0ada2db8fcfbfc59dcadf2849775ea --- /dev/null +++ b/setup.cfg @@ -0,0 +1,25 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true +split_penalty_import_names=0 +SPLIT_PENALTY_AFTER_OPENING_BRACKET=800 + +[isort] +line_length = 79 +multi_line_output = 0 +extra_standard_library = pkg_resources,setuptools +known_first_party = mmaction +known_third_party = 
cv2,decord,einops,joblib,matplotlib,mmcv,numpy,pandas,pytest,pytorch_sphinx_theme,scipy,seaborn,titlecase,torch,webcolors +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY + +[flake8] +per-file-ignores = + mmaction/configs/*:F401,F403,F405 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..6d2c59c0b21ec28f672e15559c2c2c8a2dca9e71 --- /dev/null +++ b/setup.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +""" +Setup script for GenVidBench on Hugging Face Spaces +""" + +import os +import subprocess +import sys + +def run_command(cmd): + """Run a command and return success status""" + try: + result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) + print(f"✅ {cmd}") + return True + except subprocess.CalledProcessError as e: + print(f"❌ {cmd}") + print(f"Error: {e.stderr}") + return False + +def main(): + """Setup the environment for Hugging Face Spaces""" + print("🚀 Setting up GenVidBench for Hugging Face Spaces...") + + # Create necessary directories + os.makedirs('checkpoints', exist_ok=True) + os.makedirs('tools/data/kinetics', exist_ok=True) + + # Download model checkpoint + print("📥 Downloading model checkpoint...") + if not run_command("python download_model.py"): + print("⚠️ Model download failed. 
Please check your internet connection.") + return False + + print("✅ Setup completed successfully!") + return True + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_app.py b/test_app.py new file mode 100644 index 0000000000000000000000000000000000000000..d2904e363e9764e8aa2d0eda475d0717afc5e458 --- /dev/null +++ b/test_app.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Test script for GenVidBench app +""" + +import os +import sys + +def test_imports(): + """Test if all required modules can be imported""" + print("🧪 Testing imports...") + + try: + import torch + print("✅ PyTorch imported successfully") + except ImportError as e: + print(f"❌ PyTorch import failed: {e}") + return False + + try: + import gradio as gr + print("✅ Gradio imported successfully") + except ImportError as e: + print(f"❌ Gradio import failed: {e}") + return False + + try: + from mmaction.apis import init_recognizer, inference_recognizer + print("✅ MMAction2 APIs imported successfully") + except ImportError as e: + print(f"❌ MMAction2 import failed: {e}") + return False + + return True + +def test_files(): + """Test if required files exist""" + print("🧪 Testing file structure...") + + required_files = [ + 'app.py', + 'requirements.txt', + 'README.md', + 'demo/demo_configs/tsn_r50_1x1x8_video_infer.py' + ] + + for file_path in required_files: + if os.path.exists(file_path): + print(f"✅ {file_path} exists") + else: + print(f"❌ {file_path} missing") + return False + + return True + +def test_model_checkpoint(): + """Test if model checkpoint exists or can be downloaded""" + print("🧪 Testing model checkpoint...") + + checkpoint_path = 'checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth' + + if os.path.exists(checkpoint_path): + print(f"✅ Model checkpoint found: {checkpoint_path}") + return True + else: + print(f"⚠️ Model checkpoint not found: {checkpoint_path}") + print(" Run 'python download_model.py' to download it") + return False + 
+def main(): + """Run all tests""" + print("🚀 Testing GenVidBench setup...") + print("=" * 50) + + tests = [ + ("Import Test", test_imports), + ("File Structure Test", test_files), + ("Model Checkpoint Test", test_model_checkpoint) + ] + + results = [] + for test_name, test_func in tests: + print(f"\n📋 {test_name}") + print("-" * 30) + result = test_func() + results.append((test_name, result)) + + print("\n" + "=" * 50) + print("📊 Test Results Summary:") + print("=" * 50) + + all_passed = True + for test_name, result in results: + status = "✅ PASS" if result else "❌ FAIL" + print(f"{status} {test_name}") + if not result: + all_passed = False + + if all_passed: + print("\n🎉 All tests passed! Your app is ready for deployment.") + else: + print("\n⚠️ Some tests failed. Please fix the issues before deploying.") + + return all_passed + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/tests/apis/test_inference.py b/tests/apis/test_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7ff627f694a7fb3ba7c0d339f699f83913a10a5a --- /dev/null +++ b/tests/apis/test_inference.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest import TestCase + +import torch +from mmengine.testing import assert_dict_has_keys +from parameterized import parameterized + +from mmaction.apis import (detection_inference, inference_recognizer, + init_recognizer, pose_inference) +from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract, get_str_type + + +class TestInference(TestCase): + + @parameterized.expand([(('configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-' + '1x1x3-100e_kinetics400-rgb.py'), ('cpu', 'cuda')) + ]) + def test_init_recognizer(self, config, devices): + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + config_file = osp.join(project_dir, config) + + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + + # test `init_recognizer` with str path + _ = init_recognizer(config_file, device=device) + + # test `init_recognizer` with :obj:`Path` + _ = init_recognizer(Path(config_file), device=device) + + # test `init_recognizer` with undesirable type + with self.assertRaisesRegex( + TypeError, 'config must be a filename or Config object'): + config_list = [config_file] + _ = init_recognizer(config_list) + + @parameterized.expand([(('configs/recognition/tsn/' + 'tsn_imagenet-pretrained-r50_8xb32-' + '1x1x3-100e_kinetics400-rgb.py'), 'demo/demo.mp4', + ('cpu', 'cuda'))]) + def test_inference_recognizer(self, config, video_path, devices): + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + config_file = osp.join(project_dir, config) + video_path = osp.join(project_dir, video_path) + + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + model = 
init_recognizer(config_file, device=device) + + for ops in model.cfg.test_pipeline: + if get_str_type(ops['type']) in ('TenCrop', 'ThreeCrop'): + # Use CenterCrop to reduce memory in order to pass CI + ops['type'] = 'CenterCrop' + + result = inference_recognizer(model, video_path) + + self.assertIsInstance(result, ActionDataSample) + self.assertTrue(result.pred_score.shape, (400, )) + + def test_detection_inference(self): + from mmdet.apis import init_detector + from mmdet.structures import DetDataSample + + for device in ('cpu', 'cuda'): + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501 + det_ckpt = 'http://download.openmmlab.com/mmdetection/' \ + 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + video_path = 'demo/demo_skeleton.mp4' + video_path = osp.join(project_dir, video_path) + config_file = osp.join(project_dir, det_config) + with TemporaryDirectory() as tmpdir: + frm_paths, _ = frame_extract(video_path, out_dir=tmpdir) + # skip remaining frames to speed up ut + frm_paths = frm_paths[:10] + results, data_samples = detection_inference( + config_file, det_ckpt, frm_paths, device=device) + self.assertTrue(results[0].shape, (4, )) + self.assertIsInstance(data_samples[0], DetDataSample) + # test with_score + results, data_samples = detection_inference( + config_file, + det_ckpt, + frm_paths, + with_score=True, + device=device) + self.assertTrue(results[0].shape, (5, )) + # test inference with model object + model = init_detector( + config=det_config, checkpoint=det_ckpt, device=device) + results, data_samples = detection_inference( + model, None, frm_paths, device=device) + self.assertTrue(results[0].shape, (4, )) + 
self.assertIsInstance(data_samples[0], DetDataSample) + + def test_pose_inference(self): + from mmpose.apis import init_model + from mmpose.structures import PoseDataSample + + for device in ('cpu', 'cuda'): + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) + project_dir = osp.join(project_dir, '..') + det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501 + det_ckpt = 'http://download.openmmlab.com/mmdetection/' \ + 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + pose_config = 'demo/demo_configs/' \ + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' + pose_ckpt = 'https://download.openmmlab.com/mmpose/top_down/' \ + 'hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' + video_path = 'demo/demo_skeleton.mp4' + video_path = osp.join(project_dir, video_path) + pose_config = osp.join(project_dir, pose_config) + with TemporaryDirectory() as tmpdir: + frm_paths, _ = frame_extract(video_path, out_dir=tmpdir) + # skip remaining frames to speed up ut + frm_paths = frm_paths[:10] + det_results, _ = detection_inference( + det_config, det_ckpt, frm_paths, device=device) + + results, data_samples = pose_inference( + pose_config, + pose_ckpt, + frm_paths, + det_results, + device=device) + assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores', + 'bboxes', 'keypoint_scores')) + self.assertIsInstance(data_samples[0], PoseDataSample) + + # test inference with model object + model = init_model( + config=pose_config, checkpoint=pose_ckpt, device=device) + results, data_samples = pose_inference( + model, None, frm_paths, det_results, device=device) + assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores', + 'bboxes', 'keypoint_scores')) + self.assertIsInstance(data_samples[0], PoseDataSample) diff --git 
a/tests/apis/test_inferencer.py b/tests/apis/test_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..fabcff31f4e17dc630f6702419ba43cb9a3b38c7 --- /dev/null +++ b/tests/apis/test_inferencer.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from tempfile import TemporaryDirectory +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmaction.apis import MMAction2Inferencer + + +class TestMMActionInferencer(TestCase): + + def test_init_recognizer(self): + # Initialzied by alias + _ = MMAction2Inferencer(rec='tsn') + + # Initialzied by config + _ = MMAction2Inferencer( + rec='tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb' + ) # noqa: E501 + + with self.assertRaisesRegex(ValueError, + 'rec algorithm should provided.'): + _ = MMAction2Inferencer() + + @parameterized.expand([ + (('tsn'), ('tools/data/kinetics/label_map_k400.txt'), + ('demo/demo.mp4'), ('cpu', 'cuda')) + ]) + def test_infer_recognizer(self, config, label_file, video_path, devices): + with TemporaryDirectory() as tmp_dir: + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + + # test video file input and return datasample + inferencer = MMAction2Inferencer( + config, label_file=label_file, device=device) + results = inferencer(video_path, vid_out_dir=tmp_dir) + self.assertIn('predictions', results) + self.assertIn('visualization', results) + assert osp.exists(osp.join(tmp_dir, osp.basename(video_path))) + + results = inferencer( + video_path, vid_out_dir=tmp_dir, out_type='gif') + self.assertIsInstance(results['predictions'][0], dict) + assert osp.exists( + osp.join(tmp_dir, + osp.basename(video_path).replace('mp4', 'gif'))) + + # test np.ndarray input + inferencer = MMAction2Inferencer( + config, + label_file=label_file, + device=device, + input_format='array') + import decord + 
import numpy as np + video = decord.VideoReader(video_path) + frames = [x.asnumpy()[..., ::-1] for x in video] + frames = np.stack(frames) + inferencer(frames, vid_out_dir=tmp_dir) + assert osp.exists(osp.join(tmp_dir, '00000000.mp4')) diff --git a/tests/data/activitynet_features/v_test1.csv b/tests/data/activitynet_features/v_test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..49b38d637091e2289e932e034dc03edc76a48734 --- /dev/null +++ b/tests/data/activitynet_features/v_test1.csv @@ -0,0 +1,6 @@ +f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,f100,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127,f128,f129,f130,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140,f141,f142,f143,f144,f145,f146,f147,f148,f149,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174,f175,f176,f177,f178,f179,f180,f181,f182,f183,f184,f185,f186,f187,f188,f189,f190,f191,f192,f193,f194,f195,f196,f197,f198,f199,f200,f201,f202,f203,f204,f205,f206,f207,f208,f209,f210,f211,f212,f213,f214,f215,f216,f217,f218,f219,f220,f221,f222,f223,f224,f225,f226,f227,f228,f229,f230,f231,f232,f233,f234,f235,f236,f237,f238,f239,f240,f241,f242,f243,f244,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299,f300,f301,f302,f303,f304,f305,f306,f307,f308,f309,f310,f311,f312,f313,f314,f315,
f316,f317,f318,f319,f320,f321,f322,f323,f324,f325,f326,f327,f328,f329,f330,f331,f332,f333,f334,f335,f336,f337,f338,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348,f349,f350,f351,f352,f353,f354,f355,f356,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366,f367,f368,f369,f370,f371,f372,f373,f374,f375,f376,f377,f378,f379,f380,f381,f382,f383,f384,f385,f386,f387,f388,f389,f390,f391,f392,f393,f394,f395,f396,f397,f398,f399 +-2.52400826749,0.0481050342173,-0.727137195971,2.75537272315,3.09127621822,-1.57007092339,-0.418208286763,0.0913230466118,-0.536148328353,-0.527615223662,1.09348152733,-0.740857539139,1.03076939449,0.947990020203,-0.00932133916349,0.546988826083,-0.737920381243,0.823520260094,-1.44379751155,1.67705288164,1.85386635752,0.62453161102,1.13374109944,-0.161873651211,1.40335457467,0.267813141882,1.40327533282,0.143771933515,-0.29447495679,0.779869429758,-1.38585145822,-0.361671252653,-1.46679541523,0.0859254586217,0.266080879981,-0.680839165484,-0.774731957742,-0.618207527285,1.57201336054,0.875829197772,-0.896498123858,-2.55398872891,-0.796735937603,-0.338483746318,0.511324636391,-1.21437529424,-0.0488620607446,0.253289302886,2.71006785221,-0.573161459164,-0.341657902954,-0.854258292083,0.562081610284,-0.828878082845,2.00327134909,1.29068322546,-0.418051389774,1.14570354001,1.39098484308,-1.13415579068,-1.01751984858,-0.823485884605,0.354335798556,1.79059040272,0.609877418462,-1.01807533199,1.56390048495,1.00308338848,0.226345738051,-0.145077751076,0.0986133282503,-0.0274079232177,0.0618308794267,2.33058959297,0.0527062771437,-1.11440070055,-2.85928208684,2.15750540841,0.866524370256,-0.999664886812,0.65322760642,-1.01907039308,-0.827862563442,0.702348045951,-0.266591888881,-0.51787754913,-0.87550654118,-1.08840756221,-0.330164993751,-0.885034718769,-1.09602854198,-1.90739000514,-1.41201400125,3.55564525741,2.24864990051,1.85192671744,-0.886962869481,-0.706411036437,0.962288821262,-1.30219301658,0.0603706527015,-0.672105670826,-0.147220359933,-1.00931681574,
-1.34130794644,-0.0213488208036,-0.965187689045,0.427090878957,-0.922304333641,-1.13947635577,0.637382086489,-1.706998011,0.00132625548269,0.663770250584,1.58249601114,-1.04340366269,0.375227416108,-0.0870821477482,0.551722806776,0.588611513848,-0.477017772079,-1.51536188044,0.237936462599,0.31261506067,-0.198127712396,-0.318572429209,-1.18890325315,0.035582087437,2.67528950232,-0.197889107378,1.55762961412,0.104639883842,-1.66993450781,0.702282006582,1.36717389178,0.634535223722,2.85315937821,-1.27367064913,0.483830422936,-0.869812565212,0.641265734616,-0.11914733068,1.0239396073,-3.92902142357,0.694317328488,1.34085481986,-0.135329176331,0.0261293066915,-0.303456270416,0.909167548313,-2.04735304332,-0.285427697695,-1.03457319064,-2.77420531572,0.197031497599,-0.520362589547,-1.37924786457,-0.418569629841,1.54322130788,1.83725603097,3.35605137842,-0.117215889143,-0.970470848036,-0.339063598965,1.57921290781,0.196319119013,-1.22568776573,-0.448961007657,0.609897182756,-0.168152849526,0.254480323573,-0.51589471003,-0.253088873187,-0.716572365129,-1.56268640697,-3.33835895995,-0.679914745818,0.107016925667,-1.61204098026,-0.387739681651,2.40210230323,-1.0956975287,-1.72501473746,-0.766200882827,0.752211827669,1.55532805525,0.113983938016,4.54239864121,-1.36827292666,-1.88835217549,1.40817465219,0.708602657522,1.31514883588,0.0314930005956,-0.79571607963,0.75615035674,1.14977174081,-1.72166323668,0.565034879125,-1.41448308724,-1.57710396359,-1.17078288789,1.1485206762,0.393694747107,1.20387821507,0.699366232003,1.80047030851,1.42655580688,-1.41627641805,-0.0899006423315,1.0611155262,-1.131250839,2.23898952868,-3.58230877813,-0.889216990584,1.40956827182,-1.46751403757,-0.691296854089,-1.54265676827,2.65262625498,2.19788404633,-2.01697903653,0.611521417417,0.359316692791,4.6816105414,0.862952723244,0.167491980372,2.6932665368,-3.00625465314,-0.351348050268,-0.89827277051,1.1813078626,-0.683418750015,0.612255702038,1.80744153164,0.0561640557506,-1.55411351133,0.711329718
813,-3.72017506799,0.381065155569,-0.414420442519,-1.60570235569,-0.599320146458,1.05618929973,-1.47036342112,1.14814616981,-0.245414197276,-1.86036272008,2.96957122081,-1.61679375941,-0.50189343957,3.2102935297,3.52676818145,3.37559696234,1.65133903096,1.07003903059,0.246458431642,-2.86996585644,2.9472088513,0.156860758686,2.65348488352,-1.65249707957,-1.10731408448,1.62994935577,-1.96909845304,-1.9090510476,2.51069158859,-1.65984114813,0.148115664273,1.10611308391,1.18241718985,-4.85953441229,-1.0049765752,3.88280249662,-1.75265659238,0.372608524032,-2.22002927662,1.18168715581,-2.87508345833,-0.676288569625,-2.44675108062,-1.55716385372,-1.62059798953,0.724381881496,-0.960783561886,-0.552230426264,0.121615798579,1.04462357852,0.118085120237,1.26606201262,-0.380661477003,-2.58578204132,4.03374155601,-2.25326988394,-2.88061044978,3.26819336615,1.91267201179,-0.19674664532,2.05710699236,-3.54867236793,-0.326269919106,0.752888089223,0.132116086772,-1.54644230279,-2.836589684,0.141382075407,-1.44156945706,1.19807019893,1.68431397116,0.438746488152,-2.06834516275,-0.842738093366,0.465043608979,-0.629041527666,-0.0120976683258,-3.00099798249,-1.73881566772,0.881273090875,-0.540746588847,-0.38645376593,-2.43880278615,-0.563591295604,1.477140512,-1.75295748363,1.76406287775,2.66264589914,0.484454554128,0.273973214982,-2.05206947308,-0.369256326252,-0.689306857174,1.66270560488,-0.131857610115,0.955091272134,-1.60116198558,-2.28544168464,2.11164102397,-4.18991734267,0.173959671197,-0.0354114097397,-1.4089728089,-0.311132524,1.89336391541,2.43192427419,1.01858890895,2.03606205304,1.62452822335,3.64225894583,2.28056802496,5.64531833088,-1.1566376147,2.07540663589,0.620578413989,0.750977221371,0.0162535885321,-2.16207619048,-0.105952032448,-0.117025236938,-2.50755272675,1.48142693144,-0.430885550216,2.23543980132,-0.326485130108,0.0243268507167,2.06152002688,-1.02234084951,-2.0303752323,0.561301589735,2.3433107876,-0.925805005171,2.80904484078,-1.94807647011,0.329007639042,0.
397634451785,1.47111085828,-2.50084066219,1.09999789629,-2.99330297808,-0.0599839422321,-1.9690194292,0.960052060426,-2.19808352939,-2.01816409011,-5.65800942077,-0.0169289777679,1.16420775694,0.723551353918,0.643957264021,-0.140148446853,-0.056547111384,1.91572655252,-1.37543404733,0.484043939791,2.79265339713,-1.17311209973,-0.371278463653,0.469582405128,-2.31444814128,1.41635027072,-1.07100369346 +-4.16998558362,2.12975610028,-2.56134395649,7.28089529038,5.71112143199,-1.43967841108,-2.27770995537,-0.621412650546,-1.44766437213,-2.65973161459,1.36775091092,-0.475116016803,-0.587382383942,4.81157625596,0.770176066954,0.363275742132,-0.0876347057022,-0.475521533538,-0.0547252563637,4.64327842236,3.68908154567,2.63090462903,4.96261648734,-2.3996240147,0.249490239721,1.12136919369,2.95945439398,-1.5711039712,2.68638911406,0.584886546134,-2.50314228614,-2.72285134157,0.61815967679,-1.74822253416,-0.311564020118,-2.74809125702,1.47346679886,-3.40588476142,1.47545339028,3.02455658674,-3.94506848613,-4.14376579285,-1.73336583535,-2.40840473334,-2.22219073812,-5.15251653036,0.988312865494,1.78566960146,6.54388860067,-1.45725802938,0.214708279868,-2.72405630668,2.83319289843,-1.85521226009,3.58616267999,3.34310981591,1.02165599783,3.42570413748,0.846149519881,-2.93276470105,-1.80281494916,-4.22263733625,-1.52749340316,3.2283666563,4.42827975909,-1.44139790932,1.73660321256,1.17811784268,4.59021838108,1.89355262021,0.455512814919,1.27808425168,1.62865997315,6.70429563522,-0.847455751549,-5.35004572391,-5.12095170339,6.48116056124,0.300556570692,-5.01764505545,-0.875816748044,-1.82039844963,-1.25923923691,0.632047503791,2.15801657677,-2.92180285851,0.511598025958,-2.96027669827,0.547309962512,-2.98510901829,-0.335630682309,-4.73974208434,-2.01421547413,3.362338895,5.79285810471,9.42033552887,-2.91738398632,1.82035643975,1.98379708379,-2.70420178073,-1.48058941424,1.56434452216,-0.992579338154,2.37859466165,-3.72032371362,1.26282515267,-3.50253353516,0.00376921892301,1.189621
85065,-1.0557041204,0.54337829232,-1.99295026461,2.62920855999,3.76263545752,1.2841622142,-2.72069926341,-1.80479015474,1.58534218073,2.60577425917,-0.440677909057,2.20203198473,-3.39447330793,2.79975073894,2.23906295717,0.677189537287,1.39489221702,-0.518861652811,-1.19545238594,5.21395279209,2.14497482498,3.99990809123,-1.70296090881,-2.09669830044,-0.502894639969,3.01051452478,1.25882732471,-1.28701953888,-3.64675308704,0.679585470159,-3.88040889422,0.100971349178,-3.87473366777,8.57528485777,-7.33635827383,0.620873548189,4.256256452,-3.20197622975,0.181273331641,-1.08387027582,4.7040402782,-4.30957582315,-3.2032131656,-3.55255149682,-5.39665594737,-1.43142532587,-1.0020887959,0.310152183772,-4.9755616792,0.544686280489,3.23141360442,3.48532564084,-2.27912784214,-5.4400074927,-2.9422715648,5.55690115452,1.07856818487,-2.60423706293,0.296417542696,0.018438497484,-1.6693427813,-1.97826829297,-0.649584023059,1.0299335142,-1.30126957735,-1.49028243661,-7.05598390897,1.53666977635,2.47103852113,0.548410004575,-2.33345104297,1.05941242347,-2.22456861824,-0.833920312524,0.616063261429,1.08299628615,4.64962686857,-0.85913300693,8.38019424758,-3.35722782453,-5.88692650636,2.48297270139,-1.82296590428,-1.72441059232,-3.50540684352,-4.86662904103,1.4669711864,4.01910547892,-0.666310483219,1.94299481273,-1.65633018176,-0.233463008008,2.92032059917,-3.11237916489,1.65681514025,-5.82044394652,-0.84150699973,5.2420919474,1.65209466338,5.1169664971,2.8554833293,2.7991078945,1.85252228816,-1.80552712282,0.913601561388,0.441482040088,-0.160765804846,1.5659571287,-5.15831661542,1.85946914524,4.30885611724,2.5515617756,4.66296468178,6.40177754471,0.323659792742,2.79168056408,-2.54396620949,2.11927359978,3.5409553499,0.143619238635,0.247531717618,-3.67236700398,0.0737643596032,6.4369303449,-4.20339368939,1.39238156477,-0.479590680996,1.23359161367,1.11356295109,-0.530017747878,2.8127275755,1.67139578978,-0.648806054595,-3.56483347257,-0.00777567660002,-4.97657731056,2.76010027647,2.7
9106523007,-2.92366722226,-0.381967118582,-8.20272569498,-1.22538543622,-0.975923561257,-1.2079847001,5.68413191756,-0.519274702668,1.34021991417,0.46834429979,-0.752738639987,-4.23064642449,-6.19847359916,1.9824349012,-5.77588344375,-6.11922142108,3.66428396702,7.66924429814,-0.776042481264,7.10654588699,-0.732527501781,2.01595049262,-0.872191261451,2.67919575771,2.4503210032,-2.90921337763,8.53517298381,0.212812230588,0.476091645162,0.748127258619,-0.886277671655,-2.89118565341,0.142637886207,-3.79416944186,1.11709731897,-1.30126662016,0.359220613638,-2.86900741637,-4.63997180067,1.53915568789,-4.55603598674,-2.03369594216,-1.81275931041,-2.69728669763,-2.77373948296,-0.780138870872,-0.710413366953,-1.87378830453,2.78039755662,-3.32990742207,3.18837203344,-1.00930721204,-4.34471332073,-2.7804454573,1.49880246004,1.22752761165,-1.44689382633,1.45333088478,4.27367163022,1.721656696,-3.6055589668,3.01899054011,7.5569880708,-2.61906720797,-2.57271003584,2.80881048858,-0.415334333976,-3.0628209281,-3.63716221015,-0.194801000356,2.79870586514,2.79689924727,0.0788984746723,-1.96187414487,-2.75171196282,-2.28218094111,0.444554739001,4.8369281887,0.373838265736,-3.15276482065,4.03460666657,-1.86244435867,0.253326237999,-0.800799566707,-1.74990467469,-2.74444140275,-5.73288337012,-4.91918236891,-0.418412837584,-2.99338801781,-1.38950726748,1.11461923277,5.90281201998,-0.707580384415,-2.67438790878,4.21448961059,0.828290172268,1.15630444248,2.80011883676,2.65575761526,0.483185992143,1.03626998862,0.131995103361,-2.91395613949,-1.43565141161,-2.69984012683,-0.626701895692,3.98586324195,-2.19652486801,-2.48867563566,-1.19348388483,2.79217995802,-0.750475711823,-0.945274029968,-0.126381392279,-3.6633948501,-1.54844618718,1.36196402073,0.468697243529,1.29018088311,0.94496485432,0.257892522415,-5.15796130657,-1.53281098127,0.595785883914,-0.833150585492,2.10806567272,5.13338648002,0.01430302143,1.24969169378,0.00611201127369,1.25787633081,-0.926280161539,2.16456234137,2.116730539
,4.47622630279,2.12537882169,0.520683592956,-1.542467405,6.23520137549,-1.31958263814,0.309113717082,-1.16410690943,2.81666246732,1.45756631712,-5.58640872558,-0.689133227666,-1.21494281928,-2.40350431559,-2.07186533292,-4.34414368868,-0.898425387144,-2.84011162599 +-2.85525532881,4.14924573302,-1.27022984872,4.43080223083,1.04979521433,-1.7563615183,-1.1571517543,0.443647010723,-0.840120493175,-0.564384366473,-0.631840480766,0.532262438599,0.584832645258,3.23352189611,3.05675490737,2.79432141225,-1.4358461082,0.0141486930853,0.928806241353,4.37966580232,2.8490308106,0.783738804857,3.78208962361,-2.80982620994,2.02718123476,0.447202665606,2.01867037753,0.748949680329,0.626896452109,-0.226885780966,-2.62637141645,-4.79518300573,0.517160896062,-0.495881884893,0.551008209387,-1.1999056525,1.58518931756,0.092337232629,-1.19481320501,2.92050409516,0.70208245794,-1.14886969738,0.497751923401,-0.698487961093,-1.87117256582,-1.65841737827,3.39620117505,3.17374242703,3.50091727654,0.480773175558,1.40684746265,-3.40429907004,0.423096078237,-1.25402658423,1.40384977142,2.23528889895,-0.70792874376,3.44265838623,-0.298643459876,-2.92092214823,-0.387096325756,-3.39548440655,-2.21305868606,4.01884763082,2.1962247467,0.178924582303,-0.175330102443,-1.81287087758,4.0013677895,0.506375047565,0.164289975565,2.65211846734,1.90428843131,4.45052925507,0.60681405703,-2.01008831143,-1.829990381,3.47248803615,-1.04316819509,-2.40825766305,-2.3010283341,1.26562317558,1.44828870733,0.254433333177,-0.294035871825,-2.39190562248,1.16849062324,-2.10750372112,-0.213768513898,-4.53380696336,-2.05353827099,-6.3679600064,-3.59502876282,-0.357480708757,3.44140817722,7.012797233,-1.16484250784,-0.17219096899,-1.65201326678,-3.91428116242,-1.39317485134,-1.78935467323,-2.13693570018,1.49206449827,-1.47030715466,0.326555347044,-2.8691151468,0.987859331371,-0.0670162276435,-1.38699082017,1.38502636115,-0.891648494402,1.63707906797,0.654039901097,0.315870566068,-1.13308484296,-1.63928325141,-0.5691004505
25,1.42651925405,-0.627428011101,0.216225209237,-1.25899307927,0.828946293494,0.974174125592,-0.332280605535,2.90402588169,-1.104502304,-0.644741526048,-1.07491079171,0.416999756893,1.47221087893,-3.26141314586,-2.26964950522,-0.0280790646872,3.24086038212,1.20009862085,-1.75527016382,-0.539535063108,3.23909044464,-2.99914438327,-0.492613923551,-2.91626054168,7.31597944102,-1.64774904013,-0.73017560184,0.442671738662,0.283633226553,0.714817404846,-1.79878552278,5.11262804588,-4.30506066322,-3.61411044379,-3.82477523089,-2.89008922736,-1.73692337195,-0.71265813748,0.314715143045,-3.16757190545,3.47336832523,1.5834569327,-0.637929768363,-1.56214804153,-2.64970105807,-1.12900751829,3.98810140292,1.87983502666,-2.51413838069,-0.909131198054,2.46703845749,-1.16912671606,-0.352692016586,2.6085906589,0.711290110747,1.82539761384,0.137608984311,-4.09530947288,1.01127222915,1.98808420658,0.725776154994,0.456542024016,2.36024162223,-1.51671710104,0.909857604951,1.3748901693,1.41866263221,2.22546428785,-0.842200076581,3.517446268,-1.94564609289,-2.96543750087,3.66959119841,-4.30324907561,-3.19456482887,-2.38057807227,-4.43179172357,-0.982803171277,-0.41006461223,0.280178608544,1.95349114498,0.637461675009,0.711961734593,1.80234276384,-1.78083568494,0.520603844326,-2.37248194615,0.146621232829,1.95268532594,1.55047165434,0.825010337035,2.16551250696,0.958925328056,-1.03714228699,0.654975053468,-3.01727262656,0.247705178956,-0.0690905296781,-0.235510739784,-3.40891237398,1.3884248968,1.15451488764,2.64650440057,0.807570249241,2.08921063463,0.508586264452,2.52009829918,-1.11128878554,1.39935349762,1.06951609214,-0.485668144226,-0.460008237761,-1.70877252301,0.942621914198,3.41737226328,-2.40122259855,1.40087889274,1.62360543887,-1.58665239096,1.05352225239,-1.45161462784,0.468765456079,1.15845116933,-0.269039389293,-1.64486767074,1.02112615665,-3.15314137697,1.83668091496,-0.21584566613,-3.70026185195,-0.418916064101,-3.95508877378,-2.58916404287,-0.282405416965,0.0237940859794,1
.56997692525,1.15945299725,1.77722654502,2.98457802137,-1.70026101914,-1.18428363204,-3.13462997307,2.47967257818,-3.06139141003,-3.33533022483,1.78348285884,3.65876099269,-0.542083423932,4.338555275,0.646300950845,2.75761772871,-1.33789882819,3.41355988423,-2.1038232104,-1.58832200845,4.30315493663,-0.497908014457,1.43125514845,1.23661852837,1.89458917022,-1.27604429007,-0.118665337562,-2.98061999162,0.96282290379,-0.317447299958,0.177331671019,0.190233225426,-2.25885749382,0.633996060689,-0.931709454854,-0.453512817619,-1.06709086379,-0.45003234585,-2.11921728969,0.742342797123,-1.2796056505,-3.18736832539,1.89475087484,0.647759524982,3.05645425161,1.20850815674,-2.71339397748,-0.888974133234,2.6798757871,0.973526877165,-3.10087224166,0.282148707707,-0.588648343086,1.1617284895,-0.947238893711,1.91763001402,2.77221791545,0.242102493444,-2.7309236304,1.19404949462,-2.29922574123,-0.496662088036,-1.43388394435,-0.541529648303,0.914798926115,-1.00208673149,0.693878029583,-1.63149386843,-1.92279982587,-2.83413622906,1.15527868609,1.48624739955,0.0722957122324,-2.01015367587,2.79194158167,-1.34159947316,0.350978424549,-0.150799014967,0.594457630018,-0.702615435521,-2.49834770679,-3.44722706755,0.724352367323,-1.91413194974,-1.50618719021,0.208274304816,2.56051458041,-3.38282206297,-2.67611726205,4.30181331436,2.60196872592,0.980345343721,4.28195017179,2.45016477822,-0.720569800933,0.134198579739,-0.29681619644,-0.620866637628,0.0668065834062,-0.820043117604,0.427079674204,1.07770038346,-1.89850125671,-0.367198590239,0.309245206813,1.49165853987,-1.93249949853,-0.770264412958,0.697864535651,-1.92503979524,0.36664308548,0.6772959585,-0.407557226819,-0.110297719638,0.0780190831417,1.13796422362,-2.93108891884,-0.108831601143,0.0333983715381,-0.582767866453,1.68451089442,1.07477574031,0.759609896341,1.02592154245,-1.07680930615,0.977406439981,-2.15689084132,0.897650267382,0.871076323589,0.485362575054,-0.271094031335,0.392738024197,-1.50007651523,4.16120113373,-0.875421038
27,0.770962069035,-0.193105610213,2.63168554207,-0.0860587771735,-1.02318051895,1.64206330359,-1.97631421804,-0.459768193164,-0.987577437561,-3.05661367973,0.700944906869,-2.85832208077 +-2.40668356418,3.32200128635,-0.583146995504,5.17893602371,0.543722619215,-3.61351331234,-2.15219051798,0.154239282607,-1.86185589939,1.86499222438,0.546306239763,0.173791361054,4.68988918622,1.45787520011,4.61635592778,4.10645994823,-3.2520207723,2.82534058571,1.75578262289,1.28921755393,-2.56118538936,-0.681506864627,1.08718702157,-1.73322505633,1.85559087117,2.59411209822,4.86438429197,0.952494197489,-2.00043742815,1.56013310157,-2.24776257197,-3.37023128669,-4.3081034263,2.49645762126,0.0613088496522,1.5614004375,0.196160220802,6.71882646243,-0.515890210072,4.46806035837,6.49843154748,-1.07791967916,3.66291252851,0.340969046157,-0.717211693128,0.893422653279,4.23518612067,1.59024640679,4.00953623931,7.01554282506,3.3829888622,-5.28307714462,-2.56433442275,-1.21852455298,0.420509056251,3.97645592849,-3.46140729904,2.4203199927,0.499145697951,-3.22149805546,-0.210846113366,-1.82363392035,-0.608880066672,6.9203904438,-1.60331305107,-0.572833641767,-0.809020875096,-3.67446678479,-0.751598751347,-4.0169324255,-2.54423304001,3.43391434272,2.22814426263,0.720494257411,2.44403583368,0.126800663272,1.21261574904,2.80068611622,-1.46503902833,-1.02387386938,-2.22691595475,2.92893217762,4.35140001932,2.05282717824,-1.44687641621,-1.2482182169,-2.92161394775,-1.7117234171,-0.664106516638,-4.8541015784,-5.77170533816,-4.39334596634,-3.39425205867,2.10928462108,1.63525372922,2.20211301041,1.6979695034,-3.62859933059,-5.0955384318,-3.70584682147,0.913468626738,-7.92930506388,-5.18711395264,-2.14751714547,0.553891262807,-1.69585991979,-3.80843970299,5.93398868561,-2.32868751923,-1.18235898415,2.63725592931,-1.31388532559,0.924713171173,-3.68923300982,1.09287478288,0.447131590248,-1.02456968466,-1.82614021699,-1.27993409872,1.58124616583,-3.71338141124,2.08220694741,-2.52321253032,-1.8201927487,-
0.489585822324,2.26087673823,-3.07679171085,-2.40032638788,-2.84321398576,-1.48280228813,-0.933238696854,-4.71049805482,-2.02947084586,3.95902432919,4.56408443928,2.77234577179,1.14790276547,3.24662017902,8.24014697393,-2.22661842028,2.16570036093,-1.47694238861,-1.56150964896,3.00861291885,-1.58600352287,-2.14261006952,3.36371217092,-0.277815688848,-2.55071312587,3.11163931847,-3.03255870501,-5.94063932737,-4.34915611903,-1.83065024058,0.344852973223,1.66785877029,-2.92896215598,1.02625600656,3.99294057846,-0.764026923974,-1.21331283232,1.14239682655,2.6062800312,0.555238639911,4.5995118173,4.17675596714,-4.47169959545,0.607188218708,4.99268372536,-1.10778329849,0.359094379742,2.3692166694,0.923014166752,1.39561937173,0.489449826081,-3.64099951267,1.49465563099,-0.864940508206,-3.8856684494,3.41578993161,3.80568179767,-3.16751228809,-1.90362671534,-1.0676062123,-0.827274825275,0.810656501699,-2.94211922248,3.80980886777,-0.505323204397,-2.70784498771,5.20668672403,-4.93021412532,-4.60470018069,-0.988903661569,-3.12164619764,-0.759834496776,-1.40789370815,-1.30719569206,3.67482577324,1.25514381965,0.729897277155,-0.221074349482,0.727831269502,-0.159013110398,2.35894515037,-1.60380238533,2.4536198473,-0.0437957082188,-2.46773814758,1.21704642216,-0.603128572703,-1.80407706489,3.83205666224,-6.8485059007,-0.767830495338,3.48311652978,-1.5156415077,-0.384740158121,-2.00051572681,1.33816781203,2.74709336281,-4.9876317811,-0.8754006657,1.1287828366,4.12337694327,-0.0656415704896,0.988705775539,1.21024437666,-5.10868624687,-0.0440934690972,0.0288263560326,-0.0765196786313,-2.51989612102,0.279547793863,3.3720527331,0.871332397062,-1.06302194118,3.50864712556,-5.51388967832,-2.0657237343,1.25920955737,-0.851355524064,0.682309628327,1.77832262437,0.827240066528,2.64016712666,-1.44682307978,-1.31921160618,3.49327129126,-0.484558734299,0.692844864529,1.00374541759,2.69691859166,0.154326318701,3.57687735557,2.06113112768,0.991898488825,-2.44635528803,3.95126618067,0.47298931211
2,2.33190206448,-1.30573364337,-0.437735764186,-0.251160595852,-4.47043835958,-1.51135720338,0.506121761999,-2.44358267824,0.0295987832554,-0.774288076561,1.33123704235,-9.26131312053,-1.16106868347,2.29522511721,-0.143810934227,1.58851175785,-0.934488321146,2.50735031327,-2.19833483537,0.610350404581,0.244342085619,-0.716844118736,2.41659238497,-1.20272970358,-0.134129219055,1.19137221933,1.4639560167,2.79779875596,0.0395902216937,-1.13805999756,-1.18215333223,-4.94711904526,2.09147545735,-2.13449596683,-5.07175304095,3.36638139486,3.70780602773,-0.945894616344,2.34982962509,-3.65934572061,1.50665946653,-1.83905771414,0.419523326158,-3.01953722636,-2.5896670572,-3.02772776922,-0.675756167273,2.18817773163,0.581919515134,-2.15692337871,-0.136594539186,-0.149565262596,-0.947531465589,-2.10921741764,2.44600348274,-0.959342634677,-1.03096477588,-0.498095233439,-2.70281470935,0.375763909419,-1.34648666104,-1.03886758149,0.246556117833,1.06395082176,1.52048031847,4.41094911337,1.58565980355,-0.538471474896,-3.59832179228,-1.84744771719,-1.98041345438,0.181751922071,-1.86992271225,2.09672110558,-3.00351278146,-2.34073953231,1.90364366372,-3.77574122826,-1.82476956447,-1.66754270395,-4.17944864114,-1.643569568,3.2956170102,4.84715448697,1.54389404712,0.413878052236,2.01489253759,3.26832122485,0.128817051054,5.05713614782,1.29822279056,3.8207182916,1.3051289777,2.15857723474,-1.16148341576,-2.10272764564,2.65485213935,3.33767395735,-0.225942747493,-0.0608929246157,0.386773107847,3.04139202913,0.880819526515,3.79432223876,1.34475161622,-1.15084494869,-2.72890689214,-2.20355211159,4.0270291551,0.831315397334,3.15832736333,-1.64833269834,-1.15337079207,4.42843692621,3.73798665524,1.77370616277,-0.414466093183,-5.21718411287,2.14873480677,-3.09902131875,-0.431480846305,-2.21315110326,-2.32947000265,-7.03267769655,0.620159295995,0.669400061817,-1.29065409263,0.639066412349,0.412046761511,1.52948790789,1.63768410901,-3.5861120669,1.49905408064,3.24001261135,-1.20556717555,-1.6347
0236778,-0.0621758023897,-2.13516124328,1.88267453392,-0.0397303390498 +-1.94168348829,1.77759615302,0.00324969291651,2.76537520647,0.356809294373,-3.04903903445,-0.571212081513,0.542071000835,-0.3627079765,1.24325743755,-0.427730951508,0.239423566062,3.11484637578,0.816348610718,2.79456279387,3.34600726088,-2.36868370374,0.960648590526,0.492966024081,1.63726032575,-0.520594614346,-0.710762829333,0.599766778151,-1.0725793888,1.89727054477,1.25175032437,2.9051876543,1.70391878923,-1.64619573633,0.92607907027,-1.19849523346,-2.36430278246,-2.74358758171,2.20087053259,0.111789479652,-0.408449259351,-0.328728172382,4.23223049204,-0.694623126908,2.42802311579,2.87807498376,-0.740068741241,1.62616318464,0.592944381834,0.159329541226,0.917487897477,1.93800289412,0.471566647292,2.55488344431,3.85311472585,1.20064109365,-2.55722387075,-1.1082152708,-1.02037551522,0.175513311128,2.44115464489,-1.72615523438,0.765665462018,0.977433196902,-1.83432733496,0.349625592828,-1.36312687636,0.715892488958,4.3416105775,-1.06046443701,-0.509649908741,0.497787223061,-1.32805623293,-0.711287250494,-3.12120837728,-0.91976089676,0.324390023948,1.0570640707,1.21327393075,0.919257143736,0.591331963142,0.457342879871,1.42952108284,-0.507037276824,-0.131746374767,-0.758843362331,0.87595297138,1.89250633915,1.27972093304,-1.20407567422,-1.03513803601,-1.98328871648,-0.134607525474,-1.06089170476,-3.55350990852,-3.87543780128,-3.44827607234,-1.73182748934,0.474614856841,0.146985151768,1.38470371882,1.86053468158,-2.31340465764,-2.91458182531,-2.94684989293,0.104040072957,-5.12927719911,-2.74529750084,-1.73663758914,-0.050694579085,-1.07352878233,-2.26959511399,3.31798489332,-2.0237891674,-0.758091919621,1.11872776558,-0.914398193459,0.528536718686,-2.51926944176,1.82730301281,-0.9518930234,-0.139356404841,-1.27573636502,-1.00687736809,0.303327493866,-2.61500696495,0.686027350427,-1.85459803333,-0.927233275571,-0.823916974465,1.41521901513,-2.20459855944,-1.11158712417,-2.42684030851,-0.77582788099
9,-0.958329697748,-2.08249924024,-1.46892851353,2.33831092993,2.1452542154,1.52739960074,2.11092267672,1.58193236212,4.76442153255,-0.500175990462,1.07725728969,-0.0380358799299,-0.134679699142,1.48794374386,-0.768634611766,-0.826269167265,3.30978691737,0.666516949734,-0.977266769807,2.49315859,-2.23554114183,-2.87566772004,-2.56910360535,-2.05376130993,-0.0498415172097,0.0825265093635,-2.29967758298,1.79486357128,2.97849754334,0.294754260181,-0.787193464239,1.07911070456,2.28606698573,1.0229565537,3.31828083058,2.20088501116,-2.9493214941,0.495515686273,3.67637633016,-0.658510478187,1.48509448349,0.636143160462,-0.197983719906,1.11500002464,0.257854927381,-2.22390707294,-0.292455268702,-0.33748634686,-2.67893269607,1.85805808067,3.25056247751,-1.99736208757,-0.936352628816,-1.63319216132,0.0197323211029,0.691710200409,-1.60304873069,2.2222357738,-0.494030532042,-1.08713753581,3.57224936565,-3.01036009913,-1.89224094709,0.0352522730817,-1.16673329006,-0.490160132845,-0.675803392779,-1.24046576689,1.81585133443,0.898491098881,0.944541202783,-1.58925671836,0.205119132002,0.531537915468,0.253908309937,-0.676644065382,2.24614178936,1.33602100372,-0.244497090975,1.76761640933,-1.35158142765,-0.414446250596,3.73523249467,-6.43518327713,1.29597712395,4.63066692571,-0.613321131865,0.561347877184,0.0711209956796,-0.127557443778,3.31162866752,-3.6926500086,-0.285006345312,0.5099318854,4.93547654788,0.868819684585,0.137249038219,1.1507523783,-4.40680729866,-0.0998956763242,0.600819382666,-0.423278113404,-2.2000334398,1.6212370952,3.27790774345,1.55507115324,-1.22907078028,-0.029405062197,-3.98268408457,-0.990495022685,3.63038349777,0.218821062246,-0.752823298723,-0.248150258065,1.06529252927,0.178199325207,1.01655516048,-1.81574172656,3.30965251942,-1.68384102901,-1.26371297797,1.5262250487,3.47741630872,-0.265562386513,-0.801813617449,2.82347845296,0.909660657868,-3.02272153417,1.71411415577,0.936149520477,2.0352847523,-2.75044326146,-2.03834780852,1.15331309106,-1.5244687525
5,0.960695721459,0.943234353662,-0.174043610891,-1.19030439148,-2.10528520733,-0.142430415601,-8.60760105293,-1.17194890261,0.969270079944,-0.40029415071,2.89137278279,2.72696355025,2.00195770899,-3.36749429703,-0.503019749323,-0.34973009636,1.07590580434,1.94033220887,0.0662941793603,-0.844664263724,-0.205224726594,-0.927714525858,1.82151385903,0.716765152912,-0.505078462759,-1.39169801553,-3.75972258488,3.03527408441,-3.67419142961,-4.80039191882,3.66278994322,2.01150090774,1.10328300466,1.56799778958,-1.6706875356,2.33891484579,-2.23406077822,-0.0790168158239,-2.29835296949,-3.19061029037,-3.09279531479,-1.82349063297,0.689713494777,2.95120071332,-0.454457020759,-1.06216011772,1.86404781302,0.412750553488,-0.312192496856,-0.901166524788,-1.71619956255,-0.00137017687125,-0.982375823656,-2.71353883425,-2.3097029229,-1.10547592401,0.557556620639,-0.718295222919,0.482262715501,0.333214447946,6.6798358191,1.22103029927,-0.80201618895,-4.47059836705,-1.35593414923,-2.29788345019,0.258600590626,-0.521844846309,0.437594954967,-2.18392724584,-0.493631593385,1.37908267339,-2.26255252202,-0.30756078084,-0.831326435408,-3.69798319499,-0.223937482238,4.03011242072,3.48706426779,0.441070608024,0.828923836351,1.9435341994,1.40788590272,0.878239062827,4.71550399621,-1.09901936968,3.4838750726,0.68982342432,2.16981327931,-1.96828734874,-2.21202177366,0.926186291775,1.88568594058,-1.7648316586,0.236547902774,2.64866965254,1.89112312635,2.27105943719,2.17706474463,0.199846277238,-0.520975260338,-4.22671780745,1.23446348428,2.73591025611,-0.260378292885,2.32260232011,-1.45908730944,-2.6201878802,3.38336368958,3.02514527659,0.979315823712,-1.99782266637,-1.60707169851,0.0483122205721,-4.34822138787,-0.213511068026,-2.52483056227,-1.12644841035,-6.88962921143,0.44326542365,0.096239015261,-0.0212235212322,-0.688477359512,-0.351519578299,2.47742046833,1.44010951281,-1.97519741376,2.6616740036,2.26513570666,-0.766266692481,-2.78300611377,-0.376965727806,-2.54099842787,2.187827818,1.03102
740248 diff --git a/tests/data/activitynet_features/v_test2.csv b/tests/data/activitynet_features/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..a568674f6cf39b61fa91c9fa1b1934343d8ee667 --- /dev/null +++ b/tests/data/activitynet_features/v_test2.csv @@ -0,0 +1,6 @@ +f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,f100,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127,f128,f129,f130,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140,f141,f142,f143,f144,f145,f146,f147,f148,f149,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174,f175,f176,f177,f178,f179,f180,f181,f182,f183,f184,f185,f186,f187,f188,f189,f190,f191,f192,f193,f194,f195,f196,f197,f198,f199,f200,f201,f202,f203,f204,f205,f206,f207,f208,f209,f210,f211,f212,f213,f214,f215,f216,f217,f218,f219,f220,f221,f222,f223,f224,f225,f226,f227,f228,f229,f230,f231,f232,f233,f234,f235,f236,f237,f238,f239,f240,f241,f242,f243,f244,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299,f300,f301,f302,f303,f304,f305,f306,f307,f308,f309,f310,f311,f312,f313,f314,f315,f316,f317,f318,f319,f320,f321,f322,f323,f324,f325,f326,f327,f328,f329,f330,f331,f332,f333,f334,f335,f336,f337,f338,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348,f349,f350,f351,f352,f353,f354,f355,f356,f357,f358,f359,f360,f361
,f362,f363,f364,f365,f366,f367,f368,f369,f370,f371,f372,f373,f374,f375,f376,f377,f378,f379,f380,f381,f382,f383,f384,f385,f386,f387,f388,f389,f390,f391,f392,f393,f394,f395,f396,f397,f398,f399 +-2.50391422427,1.68599787994,-6.01226188664,-0.125473405835,-4.05747392075,6.31113406836,3.125083399,-1.28819161128,-0.594363160034,-4.04687042561,3.33266554158,2.05021273438,5.06569788016,-1.51135614382,-1.75754686884,-0.330255823582,2.89510802927,-0.73977406509,-7.89353751824,-3.45772308633,1.17079686934,-4.14460512795,-1.39475490187,3.86253584502,0.447348279778,3.92883117367,-4.46848521844,-3.76229701362,1.69349113829,-3.27463325871,0.924009592578,2.12999677853,2.85659594768,-4.17102590297,5.99293164916,10.2884632288,1.83231558377,1.4478797998,-4.38947245616,3.90167659309,-1.85908630842,-3.78404481822,-4.00131390917,-5.05896560394,-5.12547527286,-1.43005141799,-0.799648821025,-3.57910595264,-2.2926393485,5.31605148185,-4.44407908701,2.30758203368,4.12896344555,-2.10192899924,-1.57365770347,1.46184540219,1.02006796352,0.693975594963,-0.882507590565,-0.268305251769,-1.78810432009,-1.44049936972,-1.30807676828,-2.54602796889,1.91918086343,-1.87330246853,-1.19116743588,-4.94173944111,3.41346881759,1.04477840726,-3.87883468949,-1.6401990057,-3.11963649974,3.10739194639,2.00107403406,-3.01992488162,-2.17734208151,1.18544464156,-3.26027744456,-1.38117784752,-1.12807281493,1.23731617227,-4.22769494609,-2.31104123998,-2.73342858264,-2.60609814517,-3.91516964902,-1.43564934755,5.86923505644,10.8698481406,-0.0644558026284,1.29974983175,11.9821762355,2.63645925008,-0.800439528532,0.305979802689,10.4448009584,3.89998507623,10.3629906773,0.987935663397,1.06111665476,1.15934493999,4.74597180691,-0.53357543254,-5.53819862455,-1.08892905758,-2.84128587559,2.54403880204,3.08628575869,2.26009004126,2.77060999349,-0.582374569877,-1.77802346002,-0.2937931835,1.02838354244,3.37142584142,-6.2468072647,2.20336157741,4.02669576097,7.7139954797,-2.62292807265,-1.63856477894,5.24209850422,-5.956894445
74,10.9237309757,5.56173629091,-0.06239338509,-0.11586309122,10.5260359799,0.0455641002992,-0.143587274683,6.85981490484,1.30256727268,0.099060309792,-0.99507694974,-2.39523977029,0.646837872527,-0.549287130061,0.528060432284,0.478981495421,-2.87669151504,-1.24631201746,-2.76280551886,-4.99648601327,1.56782352093,1.72098800023,-0.0553381940814,-5.35496277362,-1.12433242997,-0.526286978024,4.84426140262,-1.67891876845,-0.0265676538691,-3.17656040053,0.26415708479,4.03517758548,1.4993594204,3.83278299704,-2.77651900406,-0.861125229206,11.2030357751,-3.15313750697,-2.50459314309,1.78739187732,-7.82420403389,0.904809594012,-4.18456179152,0.60995901817,-1.44564015234,3.83168430104,-0.00437937539048,-2.3451228437,5.58568740368,2.97791145801,4.32271502614,-1.54512997459,0.536759116431,-1.1815032758,-3.14896126398,-6.86535051022,-2.70346348657,0.0113500145858,-2.77794979296,2.35137890776,-2.64285167165,-3.95364762035,-5.22867795339,6.15572625407,-6.91736113212,-1.52054794698,-2.80880080933,0.30321730122,-5.91560237718,-7.42976562356,-1.07937648743,-3.26394725639,5.0495641506,-0.553299233738,3.96384933141,-2.30659410078,-1.92410211898,-0.0740623548288,-0.741995456365,1.25729537246,3.06146581722,2.64592689772,-0.768545938319,-0.368544330909,-4.14440217226,1.39461226592,0.549227126659,-2.66866894906,2.50084337145,-6.41121511041,0.753405646177,0.280067476256,0.0344201303652,1.11097541213,-0.756136736626,-0.134220118965,5.6025168238,-2.69538654726,-1.20349766834,-2.90915489789,-3.07136878235,5.78831844318,4.79880530822,-1.54153241949,-4.93687499883,-1.02846407186,2.11793406884,1.81036372992,0.928447236083,-1.67445344365,5.93752378918,5.25534441684,-1.32955752029,5.02874157984,-8.32498580794,1.22665544488,0.729978278127,3.76998885216,1.18933444305,-4.01561953996,-1.91036380149,-2.01600540918,-2.19074894269,-6.06838036269,1.91566910093,3.16219263298,-5.36112836713,-3.03646755643,2.60723549671,-4.73392456058,-1.27864055974,1.65558185437,0.35871136493,-1.97445669054,2.00282359886,0.
766041404302,0.935142604145,0.146960995005,0.90301123882,0.584378651645,2.43738964301,2.14986027277,2.13076803503,3.4849176696,3.37372560032,1.19906408345,-3.25606738189,-7.18101082565,-1.28755031363,0.930275378818,0.638566405974,4.33632120663,3.7835789624,3.41258601273,-0.279865853117,-0.651737863704,-4.7223025058,5.75545690528,-0.820105519292,-4.00676441302,2.11396374954,2.60952237005,-0.820631582523,-0.546553676079,5.33481172893,1.34852465273,2.93794032376,-1.33422280837,0.00903616898423,-2.36627310158,-4.99107783527,4.48972757256,3.85615534734,0.528791357535,5.58767522678,0.127227772965,0.973913995567,-1.8062694088,2.32322553868,-0.442473914737,-0.123751487135,-1.67863033336,0.0891421785383,2.82212784306,-0.478511586228,-3.3537200428,-0.522387139102,-4.25974474021,2.87018204241,-0.111813521457,3.94839403804,3.74490500576,-2.30623158975,1.49538655047,0.530469396242,5.1296629385,-0.453469798231,0.306027388129,0.35104102143,-2.34272025863,2.87870763106,0.212640115016,0.719817214469,-0.20345939615,-0.506974699062,5.3592568385,-2.28140813929,2.88992723737,1.65410613199,4.48693866632,-0.09672872709,-1.87582435405,-2.46928755752,-3.56278716312,1.74785164057,2.74009034813,-7.29490411233,-3.16100976408,0.847520336401,2.92602454656,-0.0986801903656,-2.16201799224,-3.39690165524,1.53765563161,-1.41997380147,2.71161737728,-0.0167333157083,1.75945290337,2.10004583364,0.765974609689,1.79493778887,3.43569638106,1.49552039321,1.90617850633,-0.592973705882,4.00305455331,0.0335191789012,1.05186070161,2.48385107847,4.89055257951,2.06091725733,-0.18432842804,-4.0123498625,-1.32194922277,2.87064841629,-2.07818711219,0.695646315956,-2.8474977249,-0.372025591391,0.277543174562,0.348284025789,-0.54074715731,2.48928393808,-5.685446576,-1.66416304574,-7.02726226008,-4.88155203391,-5.57406386037,-4.91916411608,-7.94337537982,-3.65389317081,-2.97659988583,-5.97952768511,-0.575712613136,-3.38044490327,1.89594224776,-0.106777342905,-1.21814931744,2.66339186237,2.37583883107,-2.34277046832,0.
0847875222918,2.1196259109,-2.034442402,0.994460807731,-5.99126604669 +-3.61196599602,1.54396823943,-7.05199570656,0.70936037898,-4.42450754642,5.79873381853,4.79998759627,-1.51375595927,0.041889913378,-5.36947724223,3.11711617708,1.87290850281,5.37537143231,-0.140440261367,-1.07927534082,-0.8091666732,4.91609726548,-1.47799203396,-8.695467484,-4.09717354178,-1.04496299029,-3.85961924196,-2.10038466751,3.32289713025,-0.286860848963,3.96218072772,-4.39675701856,-4.40787660479,3.73622534722,-2.87716412544,0.454319910706,2.42820411325,3.82069679498,-2.79692421705,4.38538633883,10.2156878471,3.4358463645,2.12645539939,-4.04702971578,3.87549848557,-3.44834155142,-4.70891635418,-3.76960349679,-4.85522414446,-4.31793097854,-1.22963698059,0.447048375012,-2.53883199245,-3.42271156311,4.74730663896,-3.28625443876,1.15255518705,4.48008643985,-2.00973020792,0.25715895891,2.01633035838,1.72455749959,2.46865062863,-2.55920924097,-0.941734179414,-1.01115750857,-1.55530408025,-1.35561941266,-1.23846808225,4.0139059037,-2.82922329605,-1.54500077367,-4.14823132754,3.46829478144,1.42298098058,-3.60501238108,-0.478655001521,-2.27799000442,3.80441823602,0.555091810227,-4.56343603134,-3.86684781313,2.51266635656,-2.34452754557,-3.54211790189,-1.63034411222,-1.93864814639,-3.73451783657,-1.60328631774,-2.4672175467,-3.80095796585,-4.04769252539,-1.72506986559,5.59767432213,11.0820033073,-0.191732565476,1.90799899697,11.6760043621,4.55487689376,-0.31670263633,0.824923895671,8.5647937417,6.5042055428,11.780738759,1.50271001905,-0.0258838802575,0.435441556572,3.30290358961,0.377896644174,-6.5453125,-1.00815881342,-4.10386363864,1.63551698476,3.23607475758,1.42431855202,2.55384192467,-0.456127088517,-1.94804133773,0.550055715443,0.636448504358,2.32128318697,-6.70778397321,2.73787901104,3.27784690857,8.87038059237,-3.74099546671,-1.75985428691,4.34281664491,-6.43530688286,12.9979223013,6.78234988451,-0.806176937745,-0.697792875396,12.720209074,1.51877520681,0.540385435523,6.74378789664,0.84321
9137377,-0.0813938416541,0.253477528694,-0.220510208608,-0.133373232186,0.959342181682,1.10231779218,0.231312006339,-1.99769770503,-2.40456032157,-2.95679311156,-5.95258055926,1.98243983686,2.28856839836,-0.382299264148,-5.90337668657,-2.26504155695,-2.81989197582,5.54886015653,-2.23119397462,0.655153363942,-3.77459974289,1.65176175833,5.3708147645,0.977352631095,1.60295453668,-4.00599938631,-1.69029248208,10.0866486311,-3.23101823926,-3.1206391573,-0.391065824031,-6.68118602037,2.16630054861,-4.7760153234,0.383674836252,-2.48520847857,2.07149813026,-1.99720753431,-1.20698849112,6.08765767813,2.54862617255,4.67334094047,-2.9711537391,0.948479612171,-1.01456621587,-3.11699818373,-6.72917854786,-2.92183075547,0.496130555124,-1.61810724959,4.37298168838,-1.93378492743,-1.86215627491,-4.90786517859,8.62715418338,-7.5756526351,-3.27301322818,-1.76513157338,0.75444869213,-6.96635819673,-8.78930687905,-1.7524562791,-2.41629351974,3.68741244673,-1.43222312816,3.23068808318,-1.59724262357,-3.27234983742,1.24265492261,-0.0109941303718,2.80159805715,2.48849355877,3.07970299125,-0.557770807296,0.432648000119,-3.69374324679,0.0467125833038,0.424763832987,-3.38139162659,3.42404463887,-4.51077946425,2.03796033263,0.507232870907,-0.506469908358,1.50909484178,-1.27529992908,-0.255473581143,6.49730739594,-3.27221466898,0.583703720573,-2.57865453363,-2.25019647181,5.4004673481,4.42697024941,-0.0842542231125,-3.7730645895,-0.905618444086,2.8413999021,1.14175421931,0.425801990927,-0.551772788169,4.81836385727,2.67149700224,-1.60633691549,3.67677226961,-7.09939215183,3.07843704373,-0.603567731382,1.07058879137,-0.284542271494,-2.65182375908,-0.966910338403,-2.21251030267,-1.5918459788,-6.73685925007,2.16504070461,3.16708334088,-5.73397156,-0.0308346152315,3.96178902388,-4.34651784301,-0.626209998878,2.96317673624,1.55037861467,-1.6240209043,-0.916502046583,2.22772178277,1.73989147246,0.425792780239,2.44748416841,1.27179720402,3.01824558973,0.45870998502,1.6810954839,4.9340551734,4.529311
87153,1.22776987255,-4.30461632609,-8.0007350564,0.293104887008,2.59760651291,-2.09017359019,2.84267843664,3.92640956045,4.39850687385,0.263943502309,-2.52996243984,-4.9456074357,3.01140740514,0.060671949388,-3.45182769299,3.45659797787,-0.717935073377,-1.70038859993,-0.159526935219,4.78994245529,1.73284136951,3.39466386437,-3.02896884084,0.745040215552,-2.42295794487,-5.48635936975,5.81924671531,4.81498251557,0.588836860656,5.34480842352,-1.69491340667,-0.931661537289,-1.47670565099,1.95115838945,4.33551876547,-2.35900820047,-2.03742983938,-2.51175971031,2.00818323493,-1.02861073502,-2.83876619935,-1.42532885447,-3.22665929496,3.24723680019,2.50910392105,1.66940991878,1.98924016655,-2.976414603,2.39372268021,0.0301794916395,2.93753557801,-2.53472368196,-0.224031038582,2.22086050436,-4.60367997885,0.344105190041,0.892087735609,-0.732750460502,-0.0278959076854,-2.04538312331,4.39118845462,-1.92525613308,2.48760456741,2.12224386633,4.20933679342,-0.160378366895,-0.847533833979,-2.68713091612,-2.85529101193,1.45633238703,3.13940095305,-6.84778351784,-3.07674325108,2.9240462061,1.66283178181,0.366562292727,-0.474471753836,-2.22659401149,2.12781591714,-0.698044653983,3.11203145981,-0.0878812848356,2.08509909212,2.37360790372,-0.383632448313,2.85876693129,1.43884898126,2.44588458538,1.13197429609,0.669784083962,2.82567384094,-0.303028093278,0.0804680705045,1.01148720384,3.96722738147,3.78676999509,0.484674140066,-5.0017509222,0.154588726159,2.53468632102,-2.48899200261,0.211847947538,-2.28771493435,-0.277051561698,1.01623694403,0.347248692065,-1.88412645785,0.431219244007,-5.62209599018,-2.32514169514,-6.17786878348,-4.5459565401,-5.45559768676,-5.25804600716,-7.30329209566,-4.18787643314,-1.41929989755,-6.36565381289,0.691979244352,-5.4266118586,0.243365764617,-0.33372869622,-1.60025772154,2.65902011394,1.72226278037,-3.51518207789,0.837280854209,2.64499332011,-0.451456475259,4.05596930012,-4.51415959 
+-4.72683149606,1.45348708808,-8.07086817742,1.63604789376,-4.73549800873,5.20675960303,6.51230325818,-1.76839387298,0.728590119478,-6.74178866983,2.8130164218,1.58456622004,5.62148933888,1.37578496694,-0.371593541978,-1.41557620727,7.0383985126,-2.29083102226,-9.45700079202,-4.80206114411,-3.43986400128,-3.55278479934,-2.83554306328,2.7735268724,-1.13780232042,3.92281681627,-4.25488941192,-5.10927104115,5.96552311688,-2.43940485954,-0.0862283119556,2.73895709873,4.89024929762,-1.2763541922,2.57022780523,9.9613841939,5.07362765074,2.82582543075,-3.62501172424,3.7390643692,-5.19941673696,-5.66170942306,-3.52688271404,-4.6018137145,-3.43782470346,-0.992310488373,1.76652944327,-1.43113125652,-4.60094419718,3.99586562991,-2.03482079327,-0.160126103461,4.7740121144,-1.88776037335,2.26084538698,2.65253681004,2.54412336618,4.38450802416,-4.3977601847,-1.7176710071,-0.0724306311467,-1.70681380391,-1.41692107796,0.200332455933,6.24482979595,-3.83351793349,-1.88694544792,-3.24113301516,3.48263311743,1.83456811458,-3.1987385869,0.769642335775,-1.36940517485,4.47494917393,-1.01712017417,-6.15526720286,-5.62981627226,3.9166711688,-1.23287549198,-5.84563351884,-2.13252854615,-5.38287308335,-3.12790068805,-0.774887352436,-2.1297221756,-5.0906492424,-4.12367990136,-1.97023809493,5.23813544751,11.0778312242,-0.275825666287,2.59604639888,11.1118171802,6.55417260289,0.203035293669,1.38965836696,6.4515772891,9.32944820284,13.2775346517,2.04594562918,-1.18929040372,-0.312611132264,1.6740041858,1.40754847616,-7.60108621597,-0.907561735809,-5.39238245725,0.626936522051,3.35088065982,0.46351477623,2.31236622334,-0.229608643204,-2.07551843763,1.55680642903,0.263669897775,1.0858634612,-7.05738488197,3.32455673039,2.40335632682,10.0899427987,-4.92568675757,-1.80175588966,3.28225847542,-6.88330174923,14.9608820614,8.02759130716,-1.60224438258,-1.24848374822,14.9900966168,3.09142677188,1.2888044706,6.5442295146,0.330789602659,-0.286123776287,1.62822659672,2.06531837225,-0.982651502788,2.6057139
6113,1.63691263556,0.01017631717,-1.03312850952,-3.68506930947,-3.12813932538,-6.89839523554,2.3975418067,2.95167421162,-0.811870859787,-6.43306355715,-3.44969232738,-5.32219609171,6.3486418271,-2.75835331619,1.37597230494,-4.40136899472,3.19074914694,6.78243587256,0.445585229398,-0.808829127549,-5.32398023844,-2.61561192304,8.69628513216,-3.31122705817,-3.75478894711,-2.72484310418,-5.34768217325,3.53855306476,-5.38706000924,0.145446739923,-3.58612233102,0.120355840028,-4.15744045019,0.0731746891131,6.55438641787,1.99956796408,4.91731314421,-4.42644771397,1.40971697062,-0.784811406731,-3.00484983444,-6.53485749721,-3.15200479388,1.03534908369,-0.301970368177,6.51142239392,-1.10611675471,0.418995252622,-4.4721977675,11.1724257183,-8.21665349245,-5.11762260079,-0.615411399901,1.18636612185,-8.06906448126,-10.1247596884,-2.49426667422,-1.32065032601,2.17061477065,-2.33631666951,2.38926856876,-0.913166025876,-4.7118704623,2.72928834141,0.775672697726,4.4457443577,1.71014433921,3.57591197133,-0.235582885593,1.25215531408,-3.14634150744,-1.4078004086,0.365033659041,-4.17761438727,4.40297134757,-2.42336025477,3.4580388701,0.689679874331,-1.04557964027,1.87770598858,-1.80380414367,-0.417696796171,7.45841611862,-3.81225969553,2.56200723887,-2.21683688522,-1.32409115911,4.95071142197,3.92624093532,1.46352795839,-2.46225001812,-0.77849281013,3.50410349012,0.434351972267,-0.0288636657596,0.669223650095,3.49293913841,-0.137764969467,-1.96554630518,2.1402142328,-5.7265598917,5.16214273542,-2.05637966395,-1.8495585683,-1.87955528319,-1.25644548416,0.00674796104395,-2.43147389591,-0.893102669418,-7.49637273312,2.34914988339,3.13358963132,-6.12764425039,3.23036705017,5.41211955786,-3.91730147004,0.0444684042034,4.39211372912,2.92113072753,-1.25977230668,-4.12997387886,3.87697173372,2.66106281221,0.736292763781,4.03323895753,2.06197661877,3.74714529276,-1.27023549199,1.21123526514,6.49754122019,5.87128979206,1.2765970856,-5.3870420897,-8.90884536504,2.01509624004,4.4446681577,-5.096
74575568,1.20312212527,4.04149165512,5.50566021562,0.953406482342,-4.55933359832,-5.21267021895,-0.036395560507,1.1284481287,-2.80024212361,4.99020810008,-4.41919901133,-2.62691727608,0.226202066541,4.16152264595,2.0979556495,3.87861913562,-4.9043425262,1.60233154863,-2.46861632347,-6.0349463439,7.17580538869,5.88561519026,0.718002053499,5.10737453699,-3.68287960738,-3.00543767631,-1.03803471714,1.53446617425,9.3747028375,-4.76337719411,-2.39580845952,-5.3522044754,1.13427948356,-1.6372946959,-2.29562118411,-2.37800694928,-2.10207263529,3.68294849873,5.38075784862,-0.940855975155,0.0137967544802,-3.74462119222,3.33829092682,-0.57550301969,0.537392029762,-4.84174327537,-0.825007719694,4.19546295956,-7.04726793528,-2.39606908321,1.61995286934,-2.34724253952,0.159427139386,-3.66048334882,3.28457990646,-1.59395935536,2.02604223549,2.65396766722,3.91925804377,-0.170175538174,0.293078864813,-2.97810955763,-2.11363542974,1.19750591725,3.54246556639,-6.34636378288,-2.98813998103,5.24311850287,0.266658103764,0.848274391745,1.48310565829,-0.99932412535,2.74228922785,0.028886015862,3.42641401768,-0.174800014277,2.45710129201,2.67823993087,-1.63095737636,3.88755993008,-0.699719142316,3.417716069,0.163006665744,2.16666536272,1.66770118028,-0.553962221444,-1.03107923508,-0.689737435581,2.84424331307,5.59421723187,1.20365538374,-6.0307972002,1.79253649413,2.07976007581,-2.97050522506,-0.320198328197,-1.71101762295,-0.148553741649,1.92997103455,0.389586392492,-3.34172380107,-1.60005307674,-5.45010868966,-3.076508376,-5.23991111994,-4.07970976352,-5.24768321514,-5.51570352555,-6.46153886914,-4.78648862958,0.280570674728,-6.66282331825,2.05202573478,-7.5744939363,-1.66311737061,-0.568106225319,-1.98653774977,2.69276298046,1.04291445166,-4.88652718305,1.6799737481,3.19981912076,1.09642167091,7.33881660357,-2.92239319682 
+-3.95618640951,2.16822504699,-7.02749201775,2.07438584924,-3.7952008903,4.66516063452,5.66598080516,-1.93683131397,0.83286083467,-6.31038688779,1.93803728581,0.415994385479,4.63695873261,2.03064954996,-0.546765608815,-2.54600209773,6.67720080018,-2.60086139083,-8.36665858864,-5.08000973701,-3.84362360537,-3.51486208201,-2.64075744003,3.07348869205,-1.94571852326,3.0428294871,-3.48582068503,-5.26945194721,6.5893364191,-2.27115260124,-0.558212063015,2.65741990924,5.38911813021,-0.610317340195,1.36496483032,7.88430027903,4.24496084571,2.5491838041,-2.95291282773,2.46365449905,-5.8806508565,-5.27971760869,-3.57540645719,-4.17462575197,-3.20521330357,-0.712461964526,1.66458856776,-1.43753664225,-4.29921654403,2.28583934903,-1.82383457958,-1.12579432636,3.8323690407,-1.60873620778,2.88645622611,3.1870587337,3.35539863348,4.68089458585,-5.01220222473,-2.40511398852,1.23198447682,-2.04995642841,-1.54208872378,0.738531426192,6.23694182634,-3.66800229013,-1.47559821933,-2.51566377998,2.96481087386,1.93647179783,-1.85266061902,0.897218961718,-1.2290535754,3.62848708004,-1.39016747028,-5.53799726665,-5.19588583469,3.79989851355,0.365908132196,-5.86183534264,-1.74588927373,-6.0965897572,-2.17361679807,0.099301538021,-1.49651467532,-5.28756560326,-3.35764337569,-1.22807119251,4.41288296581,8.37310397655,0.329299056678,3.0666925776,8.31520066255,6.03162533879,0.254658643305,1.52927615046,5.15474370718,9.92706954478,13.1178707933,1.9851475221,-1.25251645445,-0.040588879585,0.598402907254,2.09637820482,-7.39962798595,-0.736607771963,-4.72784618586,0.148764773328,2.82482881815,-0.363951296807,2.18847515703,0.851648719757,-1.44513312698,2.82303802848,0.789665968129,-0.284895439446,-5.39480451405,3.52706449866,1.50199447424,9.94445934776,-4.85012166024,-0.775828022365,2.07768519119,-6.15859429717,12.0614514388,7.37984260201,-1.64554053068,-0.434650133851,14.1951656962,3.12879480362,1.52092895806,5.6518155706,0.0597475437445,-0.432820611596,2.15243572235,1.70108392119,-1.19518387556,3.
0659382844,0.729992161989,0.512096637264,-0.702464946806,-4.23238757848,-2.71316921115,-6.04356548428,2.08492669598,3.63833817005,-1.76652027816,-5.79197620272,-3.09022756994,-6.01349622488,6.92608562946,-2.03923279405,1.31198180869,-4.27980091691,3.90416300416,6.64981202126,0.73166857958,-1.23485268474,-5.4199275887,-3.10880723954,6.33416883498,-3.2787891686,-3.49453917981,-2.87733795069,-3.98702534318,3.87149213552,-5.16316780805,0.178835353982,-3.50880401373,-0.771996193229,-4.59445316195,0.868211128412,5.75491086721,0.921819759609,3.39493911088,-3.67554339618,1.67544182837,-0.174868727922,-2.08721256792,-5.95615169048,-3.12308293462,1.30280533791,0.644019361586,6.33218312264,-0.25693573624,1.04176057992,-3.36895969659,10.1426500809,-7.50808531523,-4.85486101508,-0.170589606464,0.612994321586,-7.87276499986,-8.79793308139,-2.78509446978,0.942439908986,1.39931613266,-1.95726648182,1.68011825532,-1.75475023031,-4.74921035767,3.71489373327,0.868516312915,4.43326895118,-0.263135685322,3.9764669311,0.911694865376,0.85224120736,-2.35560669035,-1.62565724194,1.2212044698,-4.61154775619,4.34895780444,-1.68536224604,4.06422766924,-0.0101673817625,-0.609392880799,1.22532760024,-1.5149737785,-0.805999085308,7.55067921162,-2.93719872087,3.43533396363,-2.10260034561,-0.721583162695,4.52110221148,2.69720968336,1.40812491387,-1.62846618414,-0.822517428993,2.23470644593,0.491862057373,0.920802225173,0.962496383188,1.928562572,-0.802637988328,-2.72160144806,1.0092707017,-4.93745543241,6.46554609537,-2.43392473698,-2.37087579571,-2.17133839786,-1.93240495443,-0.362681306601,-2.54449704886,-0.17978616923,-8.05280478001,1.39086142182,2.67881788671,-6.08614060402,3.92572582901,5.49754135013,-3.72346940279,-0.242022804468,4.81397798061,4.11047571898,-1.36651873588,-5.34488024235,4.95870956659,3.41118116498,0.89432107985,3.33253220856,2.74165137768,5.04070746183,-0.415948872567,1.31926612794,6.72856174469,7.17419068098,1.49098495662,-4.98007160067,-9.318038764,2.46224850535,5.276408712
87,-6.26628448487,0.635381773711,3.60578859449,6.173201437,2.24732711256,-4.89329962254,-5.55538270712,-1.49875565291,2.64946635843,-2.09067063332,6.20336785316,-6.25677093268,-2.50105109721,-0.0861860245474,3.59812706232,1.57726798058,3.84794261813,-5.72557672262,2.46239029348,-2.29553559303,-6.28103302002,6.47278197646,6.46319063902,1.48405849189,5.35767221928,-4.23237529636,-3.51878979206,-0.00904786854982,1.29577608407,8.77539933744,-5.03432886004,-2.11539484441,-6.16999167681,1.0546652633,-1.90332779229,-2.35973056435,-2.26917619407,-1.82008438647,4.08268388271,6.31470301866,-3.08372749806,-1.22069035709,-4.38186541558,3.19182102323,-1.42976873428,-0.223793095648,-5.89660835981,-1.25134502113,3.99110957295,-7.45729860783,-2.86559789747,1.66721295506,-3.13464591861,0.162813140824,-3.38049943731,2.39996716856,-2.15944387913,1.63885930896,3.04169135332,3.98578349114,0.511457957626,0.823394746482,-3.67019996286,-2.25544205963,1.80545994013,3.28000457585,-6.05162557602,-3.00187867403,6.49878694773,-0.326051785648,0.684602611069,3.36035886407,-1.228521097,2.57487190307,-0.46660696879,2.10812581897,-0.305482617393,2.75176966548,2.83328473449,-1.89653189778,2.65913075805,-0.83185869336,2.94031493856,-1.53106848534,3.9481344676,2.79967945367,0.710376281441,-1.93211027801,-2.24844452739,1.20713421225,5.22792970717,1.27727831364,-5.73701616764,2.55549032926,0.93986610532,-3.48593280315,-0.51567519635,-1.94204506159,0.172434514092,3.41956290126,0.900014420896,-3.65240677357,0.294835821394,-4.22226468399,-3.63110159874,-4.85140349388,-2.80221052408,-4.28761808038,-4.3011406827,-4.58334078341,-5.13591312647,0.760158468181,-5.32113479346,2.1639226532,-7.19870259762,-3.37775546551,-0.481121961772,-1.74219072804,3.14396611452,1.24187298924,-6.32387711763,2.16209208607,3.14260455966,-0.531431690456,7.58907546639,-2.70918695331 
+-1.8262197373,3.46346980632,-4.49737847328,2.16065784335,-1.95281272531,4.15987870455,2.97505878091,-2.04312422812,0.517240521534,-4.57863372207,0.651493945123,-1.38716154456,2.76521640778,2.06453320265,-1.35841371835,-4.05420722187,4.5255736053,-2.54840182424,-5.94124334216,-5.05016509056,-2.81190917194,-3.67080595732,-1.77554705888,3.98575968385,-2.7226165092,1.5568113219,-2.26458370864,-5.039455657,6.05570743084,-2.2971960926,-0.980765613618,2.29306887269,5.47656385183,-0.560339287222,0.599394002372,4.49311896622,1.63815704465,1.56890586585,-2.10052304626,0.367122573851,-5.79060420752,-3.93543901801,-3.83389718414,-3.62215631246,-3.43940053463,-0.401958022528,0.53790163979,-2.24713481337,-2.93054077685,-0.115260034797,-2.36293837487,-1.84129033774,1.99996710196,-1.21648682684,2.51857071042,3.64827320695,4.16069695712,3.80975566268,-4.74414714813,-3.02875908077,2.80003528588,-2.53125300467,-1.71329928577,0.627459498644,4.61502670049,-2.65913504899,-0.521180084049,-1.92113643766,2.06333797991,1.81511531889,0.170950590374,0.216834144594,-1.64254453719,1.68837884694,-0.898700688779,-3.32811955512,-3.17814456463,2.586751405,2.31587963283,-4.22904350161,-0.718470119983,-4.84180047393,-0.968689443319,1.00650176987,-0.650119360983,-4.69666749001,-1.9845663628,0.225895456072,3.25188346505,3.7214648661,1.43130174562,3.38060764193,3.90915834465,3.69100523353,-0.0311959603429,1.36241446137,4.44646521807,8.91873133182,11.7640150213,1.48888324678,-0.522589141129,0.966836237908,-0.0783090251672,2.53949897528,-6.29179323673,-0.514931401759,-2.65529345423,0.0529807021281,1.83676325411,-1.09529018879,2.14935440898,2.54911320939,-0.268381892444,4.27633724332,1.96361587535,-1.7532244429,-2.28158183038,3.45261253476,0.581260310115,8.81487321379,-3.86599963188,1.0199303931,0.769287056774,-4.58845028997,5.65657658659,5.3673800838,-1.14614109278,1.36291043639,11.190714488,2.05933052063,1.38084109723,4.25990110517,-0.0372709861028,-0.537679631114,2.06346488953,-0.573661087751,-0.9486668
8013,2.67103304624,-1.21616028428,1.53528989427,-0.828803119361,-4.25112649202,-1.87550593019,-3.89059672713,1.24767834112,4.34198590755,-3.09971417338,-4.30684231401,-1.61756324589,-5.39918883562,7.34323934078,-0.421631439929,0.682198462336,-3.61873383403,4.0223959434,5.40389529228,1.60736526251,-0.229281746149,-4.63487404227,-3.29047121048,3.27148656696,-3.16514086127,-2.58948973179,-1.45728034504,-2.60679310679,3.4551587224,-4.33734436273,0.408040666226,-2.58206250429,-0.901034320293,-3.78914433122,1.31350503355,4.04259036302,-0.537077046634,0.599152674081,-1.33412403017,1.80021581352,0.709162880183,-0.588873988985,-5.10033129215,-2.90737100959,1.37433995247,1.32315881923,4.48205828667,0.607754543123,0.468945158216,-1.78444975466,6.53551485418,-5.82657202005,-3.07283199429,-0.23395036608,-0.685120877029,-6.73997298956,-5.55178875566,-2.75079527497,4.04717801094,1.16547028959,-0.65315918714,1.06632480771,-3.69622943163,-3.77567577362,4.33929287315,0.461161797867,3.2264848721,-3.09787745297,4.30806201577,2.65380481884,-0.426790667771,-1.38944570095,-0.951971263289,2.73767599046,-4.78429574967,3.55026640534,-1.92020277738,4.08365875006,-1.34608724355,0.52991460502,-0.16311301112,-0.636902204679,-1.35728861392,7.01656522274,-1.04194504768,3.51204951167,-2.16685251236,-0.352366254777,4.10601737738,0.943123545944,0.196937171517,-1.13858823061,-0.989929439423,-0.42760035634,1.10082056798,2.88289318532,0.586945049761,0.191918394566,0.0784438192848,-3.76375469208,0.170746930539,-4.56917799234,7.20640591383,-2.03627742291,-1.162803858,-1.52358367979,-4.10165442705,-1.70038300634,-2.58114453554,0.544036584797,-8.4628292799,-0.39101603627,1.92033407032,-5.73090517998,2.77129089892,4.59895916582,-3.6993329978,-1.21856652394,4.50981304645,5.16903883219,-1.81281971931,-5.11896033764,5.63131075621,4.03798224807,0.94242796123,0.983446211217,3.34165998936,6.74135187387,2.30066786289,1.84391614258,5.99896759033,8.44891975641,1.82473051131,-3.49935032487,-9.36754787922,1.99033073068
,5.37617359877,-6.11145027161,0.840345951319,2.77300786376,6.52381299019,3.97701953709,-4.00499682903,-5.95263335466,-1.81812204599,4.49724048495,-1.33929533959,7.18550524711,-6.75075093508,-1.61648165077,-0.901867833736,3.08160940886,0.417978906632,3.44625248551,-5.78684989214,3.32444414914,-1.96475922584,-6.30903808594,4.28494357228,6.6853062272,2.70926908404,5.95959033489,-3.74483410478,-2.90718505382,1.44551556975,1.18541310728,4.11113968076,-3.76723547578,-1.37445659459,-5.52958389044,1.54767837942,-1.92233352125,-2.86162799716,-1.39507161677,-2.14555080593,4.45648285389,5.85169536352,-4.88964028836,-1.92099967062,-4.92469491005,2.25878873229,-2.46324559539,0.196596455872,-6.0487574029,-1.55177951395,2.21588104457,-6.40127635479,-1.69814975291,1.22380428523,-3.3257760489,0.0335933651775,-1.73429207817,1.6753979218,-3.37144515515,1.30529874444,3.32560100317,4.3093956852,1.69156472504,0.913729201853,-4.65146396518,-3.03416330755,3.03830222517,2.53774605692,-5.90589033127,-3.08939878225,6.98781540632,-0.339520339072,0.055654079916,5.17970392108,-2.50772905916,1.84376598061,-1.84338212296,-0.387204087972,-0.467715920136,2.99068574548,2.88048758626,-1.45433287024,-0.196598118542,0.482496773005,1.41811820263,-3.74789556999,5.93477154017,5.5825525865,3.06712063462,-2.68137378276,-3.70440641046,-0.800623167756,3.29453107536,0.885642924309,-4.48956893444,2.6876345849,-0.693841806652,-4.02581026554,-0.468509003222,-2.75538569212,0.6321949444,5.32430804849,1.74790291831,-3.13624450207,5.02021304012,-2.23322165832,-4.04383488655,-4.85901101112,-0.93985485792,-2.78530479312,-2.02520967245,-1.95793826431,-5.30569067717,0.360007514108,-2.79794396788,1.37599081039,-5.00351879596,-4.95408667088,-0.162458266318,-1.04320560053,3.89612897635,2.07402950018,-7.80881192922,2.38426132559,2.64415159821,-4.4487659061,5.65304963946,-3.48982639909 diff --git a/tests/data/annotations/action_test_anno.json b/tests/data/annotations/action_test_anno.json new file mode 100644 index 
0000000000000000000000000000000000000000..28ef0acbb60b0d7507c5f2e32fb7cd9ca2494284 --- /dev/null +++ b/tests/data/annotations/action_test_anno.json @@ -0,0 +1,34 @@ + { + "v_test1": { + "duration_second": 1, + "duration_frame": 30, + "annotations": [ + { + "segment": [ + 0.3, + 0.6 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 30, + "fps": 30.0, + "rfps": 30 + }, + "v_test2": { + "duration_second": 2, + "duration_frame": 48, + "annotations": [ + { + "segment": [ + 1.0, + 2.0 + ], + "label": "Drinking beer" + } + ], + "feature_frame": 48, + "fps": 24.0, + "rfps": 24.0 + } + } diff --git a/tests/data/annotations/audio_feature_test_list.txt b/tests/data/annotations/audio_feature_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2bc7268bca69f8bfd61b29e2b72d316f05634f2 --- /dev/null +++ b/tests/data/annotations/audio_feature_test_list.txt @@ -0,0 +1,2 @@ +test 100 127 +test 100 127 diff --git a/tests/data/annotations/audio_test_list.txt b/tests/data/annotations/audio_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..247dea4f4d910fa839b5283296ad08f26d4d3d64 --- /dev/null +++ b/tests/data/annotations/audio_test_list.txt @@ -0,0 +1,2 @@ +test.wav 100 127 +test.wav 100 127 diff --git a/tests/data/annotations/hvu_frame_test_anno.json b/tests/data/annotations/hvu_frame_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..fa9e5da4f1fc7571ac05a6d42ca869fd710006a7 --- /dev/null +++ b/tests/data/annotations/hvu_frame_test_anno.json @@ -0,0 +1,24 @@ +[ + { + "frame_dir":"imgs", + "total_frames":5, + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + }, + { + "frame_dir":"imgs", + "total_frames":5, + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + } +] diff --git 
a/tests/data/annotations/hvu_video_eval_test_anno.json b/tests/data/annotations/hvu_video_eval_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..a77398ba6c9df29d0f7d4e38cc2246ea80559cf2 --- /dev/null +++ b/tests/data/annotations/hvu_video_eval_test_anno.json @@ -0,0 +1,18 @@ +[ + { + "filename":"test.mp4", + "label":{ + "action": [2], + "scene": [2], + "object": [1] + } + }, + { + "filename":"test.avi", + "label":{ + "action": [1], + "scene": [1], + "object": [2] + } + } +] diff --git a/tests/data/annotations/hvu_video_test_anno.json b/tests/data/annotations/hvu_video_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..f5b2c58f2b428ce8e27b3bd23e35cf3ad3514f1a --- /dev/null +++ b/tests/data/annotations/hvu_video_test_anno.json @@ -0,0 +1,22 @@ +[ + { + "filename":"tmp.mp4", + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + }, + { + "filename":"tmp.mp4", + "label":{ + "concept":[250, 131, 42, 51, 57, 155, 122], + "object":[1570, 508], + "event":[16], + "action":[180], + "scene":[206] + } + } +] diff --git a/tests/data/annotations/proposal_normalized_list.txt b/tests/data/annotations/proposal_normalized_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d43bee42a0cf59f5c6ab204eb62b163070e1630 --- /dev/null +++ b/tests/data/annotations/proposal_normalized_list.txt @@ -0,0 +1,18 @@ +# 0 +imgs +5 +1 +2 +3 0.2000 0.4000 +3 0.6000 1.0000 +10 +3 1.0000 1.0000 0.2000 0.4000 +3 0.5000 0.5000 0.2000 0.6000 +3 0.3333 0.3333 0.2000 0.8000 +3 0.5000 0.5000 0.2000 1.0000 +3 0.0000 0.0000 0.4000 0.6000 +3 0.3333 0.5000 0.4000 0.8000 +3 0.6666 0.6666 0.4000 1.0000 +3 0.5000 1.0000 0.6000 0.8000 +3 1.0000 1.0000 0.6000 1.0000 +3 0.5000 1.0000 0.8000 1.0000 diff --git a/tests/data/annotations/proposal_test_list.txt b/tests/data/annotations/proposal_test_list.txt new file mode 100644 index 
0000000000000000000000000000000000000000..ba446556f439892c25614bc35bdcea37ddad56a1 --- /dev/null +++ b/tests/data/annotations/proposal_test_list.txt @@ -0,0 +1,18 @@ +# 0 +imgs +5 +1 +2 +3 1 2 +3 3 5 +10 +3 1.0000 1.0000 1 2 +3 0.5000 0.5000 1 3 +3 0.3333 0.3333 1 4 +3 0.5000 0.5000 1 5 +3 0.0000 0.0000 2 3 +3 0.3333 0.5000 2 4 +3 0.6666 0.6666 2 5 +3 0.5000 1.0000 3 4 +3 1.0000 1.0000 3 5 +3 0.5000 1.0000 4 5 diff --git a/tests/data/annotations/rawframe_test_list.txt b/tests/data/annotations/rawframe_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2c5b7d86afb4d6f46df30e10bd4740a1c3526c2 --- /dev/null +++ b/tests/data/annotations/rawframe_test_list.txt @@ -0,0 +1,2 @@ +imgs 5 127 +imgs 5 127 diff --git a/tests/data/annotations/rawframe_test_list_multi_label.txt b/tests/data/annotations/rawframe_test_list_multi_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5247c74dffba7560243f2c075b3d93adde7eaf7 --- /dev/null +++ b/tests/data/annotations/rawframe_test_list_multi_label.txt @@ -0,0 +1,2 @@ +imgs 5 1 +imgs 5 3 5 diff --git a/tests/data/annotations/rawframe_test_list_with_offset.txt b/tests/data/annotations/rawframe_test_list_with_offset.txt new file mode 100644 index 0000000000000000000000000000000000000000..620ed78a877063c458a6adca621165c7d2f94874 --- /dev/null +++ b/tests/data/annotations/rawframe_test_list_with_offset.txt @@ -0,0 +1,2 @@ +imgs 2 5 127 +imgs 2 5 127 diff --git a/tests/data/annotations/rawvideo_test_anno.json b/tests/data/annotations/rawvideo_test_anno.json new file mode 100644 index 0000000000000000000000000000000000000000..f67ad8e56794680cbd806697cf0d4ea0c6ad1f5f --- /dev/null +++ b/tests/data/annotations/rawvideo_test_anno.json @@ -0,0 +1,8 @@ +[ + { + "video_dir":"rawvideo_dataset", + "label":1, + "num_clips":2, + "positive_clip_inds":[0] + } +] diff --git a/tests/data/annotations/rawvideo_test_anno.txt b/tests/data/annotations/rawvideo_test_anno.txt new file mode 100644 index 
0000000000000000000000000000000000000000..1bdd573e27f57a09032e8874d5746d58ee5899fe --- /dev/null +++ b/tests/data/annotations/rawvideo_test_anno.txt @@ -0,0 +1 @@ +rawvideo_dataset 1 2 0 diff --git a/tests/data/annotations/sample.pkl b/tests/data/annotations/sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8bd245b9a6269c66c85bfc9b75722ac1315a13bf --- /dev/null +++ b/tests/data/annotations/sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eacddfc5e56956a7af7e3a3f7c57fac9d8cf5892e36f16274a1f2f5217f16c2 +size 278252 diff --git a/tests/data/annotations/video_test_list.txt b/tests/data/annotations/video_test_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a7fb78cb27309683dc65a61a717206a9d460d33 --- /dev/null +++ b/tests/data/annotations/video_test_list.txt @@ -0,0 +1,2 @@ +test.mp4 0 +test.mp4 0 diff --git a/tests/data/annotations/video_test_list_multi_label.txt b/tests/data/annotations/video_test_list_multi_label.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f1a510d7981711a49a9875c35a3dc1cdd601341 --- /dev/null +++ b/tests/data/annotations/video_test_list_multi_label.txt @@ -0,0 +1,2 @@ +test.mp4 0 3 +test.mp4 0 2 4 diff --git a/tests/data/annotations/video_text_test_list.json b/tests/data/annotations/video_text_test_list.json new file mode 100644 index 0000000000000000000000000000000000000000..99e968fe88499ca6264b612bc73c722eab14c3f5 --- /dev/null +++ b/tests/data/annotations/video_text_test_list.json @@ -0,0 +1 @@ +{"test.mp4": ["A person is cleaning a swimming pool", "A person is using a cleaning machine to clean the swimming pool"]} \ No newline at end of file diff --git a/tests/data/ava_dataset/action_list.txt b/tests/data/ava_dataset/action_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a5d442bbf947694c274798064263bdca31255d2 --- /dev/null +++ b/tests/data/ava_dataset/action_list.txt @@ -0,0 +1,16 @@ +item { + 
name: "action1" + id: 12 +} +item { + name: "action2" + id: 17 +} +item { + name: "action3" + id: 79 +} +item { + name: "action3" + id: 80 +} diff --git a/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv b/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..3b353a69e9dc311688c8470fc6ec5da18e3a273d --- /dev/null +++ b/tests/data/ava_dataset/ava_excluded_timestamps_sample.csv @@ -0,0 +1,2 @@ +0f39OWEqJ24,0903 +_-Z6wFjXtGQ,0902 diff --git a/tests/data/ava_dataset/ava_proposals_sample.pkl b/tests/data/ava_dataset/ava_proposals_sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e6113a28ca8da045ffda783d4cdb227012bd8c3f --- /dev/null +++ b/tests/data/ava_dataset/ava_proposals_sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c688e3fdd42655fd4d744813a4ba507d1349917f28215b8c9eba03a3aa9e4b8 +size 476 diff --git a/tests/data/ava_dataset/ava_sample.csv b/tests/data/ava_dataset/ava_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..888369ce272399e3a514da987f2293a326a7e933 --- /dev/null +++ b/tests/data/ava_dataset/ava_sample.csv @@ -0,0 +1,8 @@ +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,12,0 +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,17,0 +0f39OWEqJ24,0902,0.031,0.162,0.670,0.995,79,0 +0f39OWEqJ24,0903,0.034,0.189,0.669,0.980,12,0 +0f39OWEqJ24,0903,0.034,0.189,0.669,0.980,17,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0 +_-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,80,0 diff --git a/tests/data/bsp_features/v_test1.npy b/tests/data/bsp_features/v_test1.npy new file mode 100644 index 0000000000000000000000000000000000000000..57e96ee51702ddd221ebf17961feb07cca02b9c4 --- /dev/null +++ b/tests/data/bsp_features/v_test1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf3a0c38540d78aac01ce02a17af366cd95c54e05d80748fafa294f2ad19964 +size 
170368 diff --git a/tests/data/eval_detection/action_list.txt b/tests/data/eval_detection/action_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ca0ac4a6732343148d6b90c884f275c6c68fe2 --- /dev/null +++ b/tests/data/eval_detection/action_list.txt @@ -0,0 +1,12 @@ +item { + name: "action1" + id: 1 +} +item { + name: "action2" + id: 2 +} +item { + name: "action3" + id: 3 +} diff --git a/tests/data/eval_detection/gt.csv b/tests/data/eval_detection/gt.csv new file mode 100644 index 0000000000000000000000000000000000000000..b68e8f60d8692d48477fef5d431d51a611d1c353 --- /dev/null +++ b/tests/data/eval_detection/gt.csv @@ -0,0 +1,12 @@ +3reY9zJKhqN,1774,0.278,0.203,0.964,0.677,3,0 +3reY9zJKhqN,1774,0.050,0.230,0.522,0.952,1,1 +3reY9zJKhqN,1774,0.154,0.039,0.757,0.743,1,2 +3reY9zJKhqN,1774,0.428,0.482,0.659,0.607,2,3 +HmR8SmNIoxu,1384,0.278,0.296,0.729,0.957,3,0 +HmR8SmNIoxu,1384,0.254,0.371,0.677,0.859,3,1 +HmR8SmNIoxu,1384,0.061,0.318,0.584,0.710,1,2 +HmR8SmNIoxu,1384,0.484,0.483,0.895,0.837,3,3 +5HNXoce1raG,1097,0.195,0.031,1.000,0.664,2,0 +5HNXoce1raG,1097,0.047,0.218,0.512,0.504,1,1 +5HNXoce1raG,1097,0.362,0.465,0.932,0.696,2,2 +5HNXoce1raG,1097,0.446,0.156,0.856,0.951,3,3 diff --git a/tests/data/eval_detection/pred.csv b/tests/data/eval_detection/pred.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff14331acc0ead0199bbe05902b4b68222cb0c8a --- /dev/null +++ b/tests/data/eval_detection/pred.csv @@ -0,0 +1,30 @@ +3reY9zJKhqN,1774,0.072,0.470,0.840,0.898,2,0.655 +3reY9zJKhqN,1774,0.230,0.215,0.781,0.534,1,0.949 +3reY9zJKhqN,1774,0.195,0.128,0.643,0.944,1,0.640 +3reY9zJKhqN,1774,0.236,0.189,0.689,0.740,3,0.681 +3reY9zJKhqN,1774,0.375,0.371,0.726,0.804,3,0.425 +3reY9zJKhqN,1774,0.024,0.398,0.776,0.719,1,0.160 +3reY9zJKhqN,1774,0.477,0.135,0.959,0.967,2,0.753 +3reY9zJKhqN,1774,0.435,0.071,0.966,0.578,1,0.088 +3reY9zJKhqN,1774,0.089,0.494,0.583,0.669,1,0.084 +3reY9zJKhqN,1774,0.136,0.129,0.507,0.532,1,0.041 
+HmR8SmNIoxu,1384,0.152,0.299,0.599,0.577,1,0.060 +HmR8SmNIoxu,1384,0.360,0.170,0.731,0.987,3,0.138 +HmR8SmNIoxu,1384,0.348,0.193,0.533,0.727,2,0.429 +HmR8SmNIoxu,1384,0.242,0.396,0.875,0.907,2,0.470 +HmR8SmNIoxu,1384,0.496,0.023,0.730,0.673,3,0.473 +HmR8SmNIoxu,1384,0.038,0.025,0.843,0.570,1,0.606 +HmR8SmNIoxu,1384,0.156,0.193,0.836,0.836,2,0.388 +HmR8SmNIoxu,1384,0.433,0.072,0.962,0.755,3,0.787 +HmR8SmNIoxu,1384,0.430,0.026,0.948,0.524,2,0.518 +HmR8SmNIoxu,1384,0.273,0.210,0.907,0.712,3,0.396 +5HNXoce1raG,1097,0.331,0.328,0.783,0.825,3,0.157 +5HNXoce1raG,1097,0.140,0.195,0.558,0.983,3,0.989 +5HNXoce1raG,1097,0.130,0.207,0.761,0.523,2,0.976 +5HNXoce1raG,1097,0.145,0.444,0.611,0.571,1,0.560 +5HNXoce1raG,1097,0.448,0.116,0.513,0.657,1,0.131 +5HNXoce1raG,1097,0.468,0.361,0.511,0.512,2,0.608 +5HNXoce1raG,1097,0.321,0.093,0.749,0.841,1,0.298 +5HNXoce1raG,1097,0.018,0.137,0.650,0.832,3,0.390 +5HNXoce1raG,1097,0.002,0.417,0.851,0.573,1,0.083 +5HNXoce1raG,1097,0.130,0.389,0.872,0.611,2,0.912 diff --git a/tests/data/eval_detection/proposal.pkl b/tests/data/eval_detection/proposal.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1f55c278e90030cf48a6c6d0b4ca3a0234c4d1a3 --- /dev/null +++ b/tests/data/eval_detection/proposal.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c86e60cdae116dda8683d7bb68d3ac49e0beea8042371122acea1ee2f63faa1e +size 2085 diff --git a/tests/data/eval_localization/gt.json b/tests/data/eval_localization/gt.json new file mode 100644 index 0000000000000000000000000000000000000000..a82f034b59c1195b6e2bcc8f39857e30d202e40b --- /dev/null +++ b/tests/data/eval_localization/gt.json @@ -0,0 +1,46 @@ +{ + "v_bYUmtLBL7W4": { + "duration": 224.49, + "subset": "validation", + "resolution": "1920x1080", + "url": "https://www.youtube.com/watch?v=bYUmtLBL7W4", + "annotations": [ + { + "segment": [ + 11.553655226209049, + 57.06805460218409 + ], + "label": "Wakeboarding" + }, + { + "segment": [ + 68.62170982839314, + 
126.03987519500778 + ], + "label": "Wakeboarding" + }, + { + "segment": [ + 135.4928658346334, + 201.31368954758187 + ], + "label": "Wakeboarding" + } + ] + }, + "v_hDPLy21Yyuk": { + "duration": 76.23, + "subset": "validation", + "resolution": "1280x720", + "url": "https://www.youtube.com/watch?v=hDPLy21Yyuk", + "annotations": [ + { + "segment": [ + 21.392480499219968, + 76.161 + ], + "label": "Cleaning shoes" + } + ] + } +} diff --git a/tests/data/eval_localization/result.json b/tests/data/eval_localization/result.json new file mode 100644 index 0000000000000000000000000000000000000000..04ea06e4b9b597b6d9a642d76cbd3d97a54789b8 --- /dev/null +++ b/tests/data/eval_localization/result.json @@ -0,0 +1,120 @@ +{ + "results": { + "bYUmtLBL7W4": [ + { + "label": "Wakeboarding", + "score": 0.6533445119857788, + "segment": [ + 0.0, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.5620265007019043, + "segment": [ + 33.64346119536128, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.4421495497226715, + "segment": [ + 148.03122925958965, + 204.1036645851918 + ] + }, + { + "label": "Wakeboarding", + "score": 0.31284379959106445, + "segment": [ + 0.0, + 123.35935771632472 + ] + }, + { + "label": "Wakeboarding", + "score": 0.2897574603557587, + "segment": [ + 67.28692239072257, + 206.3465619982159 + ] + }, + { + "label": "Wakeboarding", + "score": 0.284942090511322, + "segment": [ + 33.64346119536128, + 125.60225512934882 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12905514240264893, + "segment": [ + 0.0, + 53.829537912578054 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12616874277591705, + "segment": [ + 67.28692239072257, + 123.35935771632472 + ] + }, + { + "label": "Wakeboarding", + "score": 0.12591737508773804, + "segment": [ + 100.93038358608386, + 204.1036645851918 + ] + }, + { + "label": "Wakeboarding", + "score": 0.10444077104330064, + "segment": [ + 38.12925602140946, + 53.829537912578054 + ] + } + ], + 
"hDPLy21Yyuk": [ + { + "label": "Cleaning shoes", + "score": 0.5667440891265869, + "segment": [ + 21.222965776805253, + 75.03834328227572 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.414698988199234, + "segment": [ + 21.222965776805253, + 43.96185768052516 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.21768000721931455, + "segment": [ + 0.0, + 75.03834328227572 + ] + }, + { + "label": "Cleaning shoes", + "score": 0.10800375044345856, + "segment": [ + 29.560559474835888, + 70.49056490153174 + ] + } + ] + } +} diff --git a/tests/data/eval_multisports/data_samples.pkl b/tests/data/eval_multisports/data_samples.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ca2b050c40607b5d7f13ffcfbdce7489991e7266 --- /dev/null +++ b/tests/data/eval_multisports/data_samples.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d2edf0d7168573a7007df44b0078379310038265ab9452ad28cc630a68e48b +size 368784 diff --git a/tests/data/eval_multisports/gt.pkl b/tests/data/eval_multisports/gt.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8a74fbb169c76b4e5f849b0308c9498c98ad1aae --- /dev/null +++ b/tests/data/eval_multisports/gt.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc51c0efc5516e0472a568ecbdb782c6557c706c72c3129a19d2e261a5d86d4 +size 7891 diff --git a/tests/data/imgs/img_00001.jpg b/tests/data/imgs/img_00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e846e5af2e8cad0c4d99f440b0d1d7709f82fd26 Binary files /dev/null and b/tests/data/imgs/img_00001.jpg differ diff --git a/tests/data/imgs/img_00002.jpg b/tests/data/imgs/img_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6d7c81b31702ec1e861eb94fbeed4509d4a23d75 Binary files /dev/null and b/tests/data/imgs/img_00002.jpg differ diff --git a/tests/data/imgs/img_00003.jpg b/tests/data/imgs/img_00003.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..6289b32ecf59281f846de097bad6d577b9fb59a4 Binary files /dev/null and b/tests/data/imgs/img_00003.jpg differ diff --git a/tests/data/imgs/img_00004.jpg b/tests/data/imgs/img_00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a75094d0d5b64889ed0c36c5ecdb98428ecd3b94 Binary files /dev/null and b/tests/data/imgs/img_00004.jpg differ diff --git a/tests/data/imgs/img_00005.jpg b/tests/data/imgs/img_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..25828b83669ad72e1e76fb249282899798167eaf Binary files /dev/null and b/tests/data/imgs/img_00005.jpg differ diff --git a/tests/data/imgs/img_00006.jpg b/tests/data/imgs/img_00006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7f0fa6ca5ce2bd44b3c2108c9024cafcd9e12fc7 Binary files /dev/null and b/tests/data/imgs/img_00006.jpg differ diff --git a/tests/data/imgs/img_00007.jpg b/tests/data/imgs/img_00007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2ebc51fe1b110d26e1201299eaad86c4ee5c0460 Binary files /dev/null and b/tests/data/imgs/img_00007.jpg differ diff --git a/tests/data/imgs/img_00008.jpg b/tests/data/imgs/img_00008.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f9747042fbb3c2409c013b288076bdc9d2a0d3aa Binary files /dev/null and b/tests/data/imgs/img_00008.jpg differ diff --git a/tests/data/imgs/img_00009.jpg b/tests/data/imgs/img_00009.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b4a74ebb0debc4fcbd9c96e75fb679383a05fbbb Binary files /dev/null and b/tests/data/imgs/img_00009.jpg differ diff --git a/tests/data/imgs/img_00010.jpg b/tests/data/imgs/img_00010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9944e620895f613649e97c7cd74a4c3f6d1ab746 Binary files /dev/null and b/tests/data/imgs/img_00010.jpg differ diff --git a/tests/data/imgs/x_00001.jpg b/tests/data/imgs/x_00001.jpg new file mode 100644 
index 0000000000000000000000000000000000000000..705ba4b6aee3fd579f0c6ff3edf709b27bacdb8b Binary files /dev/null and b/tests/data/imgs/x_00001.jpg differ diff --git a/tests/data/imgs/x_00002.jpg b/tests/data/imgs/x_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f5016755fb98a2f7f81913b85d1c8e728f6098bb Binary files /dev/null and b/tests/data/imgs/x_00002.jpg differ diff --git a/tests/data/imgs/x_00003.jpg b/tests/data/imgs/x_00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f419d712d874e67305799c39818c8375e3c66d15 Binary files /dev/null and b/tests/data/imgs/x_00003.jpg differ diff --git a/tests/data/imgs/x_00004.jpg b/tests/data/imgs/x_00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb52d25933899bc3bee8a29e36fa22554f0b2e31 Binary files /dev/null and b/tests/data/imgs/x_00004.jpg differ diff --git a/tests/data/imgs/x_00005.jpg b/tests/data/imgs/x_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..399fda2544f4c819ec86bcd460d90fd90a27c1c2 Binary files /dev/null and b/tests/data/imgs/x_00005.jpg differ diff --git a/tests/data/imgs/y_00001.jpg b/tests/data/imgs/y_00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..743b0b2a6d1c16093a5a20b1d0583d791145d4b9 Binary files /dev/null and b/tests/data/imgs/y_00001.jpg differ diff --git a/tests/data/imgs/y_00002.jpg b/tests/data/imgs/y_00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..37f84d07eec4abc29311f743636c9e33d67bbe15 Binary files /dev/null and b/tests/data/imgs/y_00002.jpg differ diff --git a/tests/data/imgs/y_00003.jpg b/tests/data/imgs/y_00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..938a5b6cdc6280c477f88f1815177888200aead5 Binary files /dev/null and b/tests/data/imgs/y_00003.jpg differ diff --git a/tests/data/imgs/y_00004.jpg b/tests/data/imgs/y_00004.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..af4c666c4c411c97ab8d48034ddaaa0b8c05855a Binary files /dev/null and b/tests/data/imgs/y_00004.jpg differ diff --git a/tests/data/imgs/y_00005.jpg b/tests/data/imgs/y_00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41e05d707236b7cf9cb3c48d01ac3b8d2cbfe3bf Binary files /dev/null and b/tests/data/imgs/y_00005.jpg differ diff --git a/tests/data/lfb/lfb_unittest.pkl b/tests/data/lfb/lfb_unittest.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9d93e4da01f55ba007e0682846c315d9d64594b3 --- /dev/null +++ b/tests/data/lfb/lfb_unittest.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e8ec0dcce016f4f35d69e3386ba5c4e449ad623eddd97de92c42b79670c0da +size 81082 diff --git a/tests/data/multisports_dataset/multisports_proposals_sample.pkl b/tests/data/multisports_dataset/multisports_proposals_sample.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4ee2f3e359be2aa7fd84f8b78e3f564c72bcf8da --- /dev/null +++ b/tests/data/multisports_dataset/multisports_proposals_sample.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97e778f7f2b9adf28215625ee71aaa62b522931cdf2945235ce5fa06a7968dcb +size 2982 diff --git a/tests/data/multisports_dataset/multisports_sample.csv b/tests/data/multisports_dataset/multisports_sample.csv new file mode 100644 index 0000000000000000000000000000000000000000..d457a7399b591f74491c23b19250d3cc25a3d163 --- /dev/null +++ b/tests/data/multisports_dataset/multisports_sample.csv @@ -0,0 +1,9 @@ +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,377,0.706,0.439,0.794,0.811,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,378,0.689,0.438,0.794,0.804,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,379,0.672,0.419,0.802,0.797,11,0 +aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,380,0.680,0.361,0.791,0.783,11,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,443,0.109,0.669,0.345,0.768,1,0 
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,444,0.112,0.668,0.347,0.767,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,445,0.115,0.663,0.350,0.761,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,446,0.117,0.644,0.352,0.757,1,0 +aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,447,0.118,0.636,0.352,0.754,1,0 \ No newline at end of file diff --git a/tests/data/proposals/v_test1.csv b/tests/data/proposals/v_test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..9d4f4a575bf295cc7d28ebb987285779eaeef4ea --- /dev/null +++ b/tests/data/proposals/v_test1.csv @@ -0,0 +1,10 @@ +tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa +0.1,0.2,0.95,0.96,0.97,0.85,0.84 +0.2,0.3,0.94,0.95,0.96,0.84,0.83 +0.3,0.4,0.93,0.94,0.95,0.83,0.82 +0.4,0.5,0.92,0.93,0.94,0.82,0.81 +0.5,0.6,0.91,0.92,0.93,0.81,0.80 +0.6,0.7,0.90,0.91,0.92,0.80,0.79 +0.5,0.7,0.90,0.91,0.92,0.80,0.79 +0.6,0.8,0.90,0.91,0.92,0.80,0.79 +0.4,0.7,0.90,0.91,0.92,0.80,0.79 diff --git a/tests/data/proposals/v_test2.csv b/tests/data/proposals/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..a2863df6e6cd7e527490d9919cf896bda45995e7 --- /dev/null +++ b/tests/data/proposals/v_test2.csv @@ -0,0 +1,7 @@ +tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa +0.1,0.2,0.95,0.96,0.97,0.75,0.74 +0.2,0.3,0.94,0.95,0.96,0.74,0.73 +0.3,0.4,0.93,0.94,0.95,0.73,0.72 +0.4,0.5,0.92,0.93,0.94,0.72,0.71 +0.5,0.6,0.91,0.92,0.93,0.71,0.70 +0.6,0.7,0.90,0.91,0.92,0.70,0.79 diff --git a/tests/data/rawvideo_dataset/part_1.mp4 b/tests/data/rawvideo_dataset/part_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fb5b091c6ca597ad605b3b125eb47cf24295468d --- /dev/null +++ b/tests/data/rawvideo_dataset/part_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019048c7c7e6ceb3b4821f027ff3cda0e198ab33dd28b32d9ee7e397be87b4b3 +size 158581 diff --git a/tests/data/tem_results/v_test1.csv b/tests/data/tem_results/v_test1.csv new file mode 100644 
index 0000000000000000000000000000000000000000..5ec36a078ec9483906af43074bb3dee54d941ca0 --- /dev/null +++ b/tests/data/tem_results/v_test1.csv @@ -0,0 +1,11 @@ +action,start,end,tmin,tmax +3.711169585585594177e-02,5.839086771011352539e-01,1.464508026838302612e-01,0.0,0.1 +1.555041410028934479e-02,3.062666654586791992e-01,2.622193098068237305e-01,0.1,0.2 +1.146762818098068237e-02,1.464279890060424805e-01,3.260520696640014648e-01,0.2,0.3 +1.371797081083059311e-02,1.365097165107727051e-01,3.570831716060638428e-01,0.3,0.4 +1.519643329083919525e-02,1.688144057989120483e-01,3.057994544506072998e-01,0.4,0.5 +1.968025043606758118e-02,1.974480003118515015e-01,2.933082580566406250e-01,0.5,0.6 +2.251588553190231323e-02,1.885317713022232056e-01,3.326449990272521973e-01,0.6,0.7 +2.402217499911785126e-02,1.918197423219680786e-01,3.420312106609344482e-01,0.7,0.8 +2.045033127069473267e-02,1.970291137695312500e-01,3.339000344276428223e-01,0.8,0.9 +3.435279428958892822e-02,5.583426356315612793e-01,1.250019371509552002e-01,0.9,1.0 diff --git a/tests/data/tem_results/v_test2.csv b/tests/data/tem_results/v_test2.csv new file mode 100644 index 0000000000000000000000000000000000000000..79f0685ea72932ea1bbef6716410b6ec41896718 --- /dev/null +++ b/tests/data/tem_results/v_test2.csv @@ -0,0 +1,11 @@ +action,start,end,tmin,tmax +5.711169585585594177e-02,7.839086771011352539e-01,3.464508026838302612e-01,0.0,0.1 +2.555041410028934479e-02,3.062666654586791992e-01,3.622193098068237305e-01,0.1,0.2 +2.146762818098068237e-02,2.464279890060424805e-01,3.260520696640014648e-01,0.2,0.3 +1.371797081083059311e-02,1.365097165107727051e-01,3.570831716060638428e-01,0.3,0.4 +1.519643329083919525e-02,1.688144057989120483e-01,3.057994544506072998e-01,0.4,0.5 +1.968025043606758118e-02,1.974480003118515015e-01,2.933082580566406250e-01,0.5,0.6 +2.251588553190231323e-02,1.885317713022232056e-01,3.326449990272521973e-01,0.6,0.7 +2.402217499911785126e-02,1.918197423219680786e-01,3.420312106609344482e-01,0.7,0.8 
+2.045033127069473267e-02,1.970291137695312500e-01,3.339000344276428223e-01,0.8,0.9 +3.435279428958892822e-02,5.583426356315612793e-01,1.250019371509552002e-01,0.9,1.0 diff --git a/tests/data/test.avi b/tests/data/test.avi new file mode 100644 index 0000000000000000000000000000000000000000..b65a08432cb327a7592f72e244a73cb8dcdb9d88 --- /dev/null +++ b/tests/data/test.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3ebb11e80d2900071ff929633b7476a33ee1698ac9a91206e2ba64c1c28920 +size 294566 diff --git a/tests/data/test.jpg b/tests/data/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d88aea0ac50bce6efdde58c2248bbd25d1ae9122 Binary files /dev/null and b/tests/data/test.jpg differ diff --git a/tests/data/test.mp4 b/tests/data/test.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..883a242d1bfe37908c2d9545de5bba067029de33 --- /dev/null +++ b/tests/data/test.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38eff15224f44090631dc29b28804f5781c07c1a579e918a74d3e3bb9d12cb59 +size 1352828 diff --git a/tests/data/test.wav b/tests/data/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..4ff616e5b8f1b8468cc319ea845ffa02da6bf7bb --- /dev/null +++ b/tests/data/test.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c139b1dcd0ebebbe6417038d75126b25fbf259b7993eedce5130bc200f55049 +size 419710 diff --git a/tests/datasets/__init__.py b/tests/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..41c3272a7cce96aeda4fccabef088e589347e63a --- /dev/null +++ b/tests/datasets/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import BaseTestDataset + +__all__ = ['BaseTestDataset'] diff --git a/tests/datasets/base.py b/tests/datasets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e72a3a1e0b8e536f0bb1040600fc22e0ec7de600 --- /dev/null +++ b/tests/datasets/base.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +from mmengine import ConfigDict + + +class BaseTestDataset: + + @classmethod + def setup_class(cls): + # prefix path + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../data/')) + cls.ann_file_prefix = osp.join(cls.data_prefix, 'annotations') + + # annotations path + cls.action_ann_file = osp.join(cls.ann_file_prefix, + 'action_test_anno.json') + cls.audio_feature_ann_file = osp.join(cls.ann_file_prefix, + 'audio_feature_test_list.txt') + cls.audio_ann_file = osp.join(cls.ann_file_prefix, + 'audio_test_list.txt') + cls.frame_ann_file_multi_label = osp.join( + cls.ann_file_prefix, 'rawframe_test_list_multi_label.txt') + cls.frame_ann_file_with_offset = osp.join( + cls.ann_file_prefix, 'rawframe_test_list_with_offset.txt') + cls.frame_ann_file = osp.join(cls.ann_file_prefix, + 'rawframe_test_list.txt') + cls.hvu_frame_ann_file = osp.join(cls.ann_file_prefix, + 'hvu_frame_test_anno.json') + cls.hvu_video_ann_file = osp.join(cls.ann_file_prefix, + 'hvu_video_test_anno.json') + cls.hvu_video_eval_ann_file = osp.join( + cls.ann_file_prefix, 'hvu_video_eval_test_anno.json') + cls.proposal_ann_file = osp.join(cls.ann_file_prefix, + 'proposal_test_list.txt') + cls.proposal_norm_ann_file = osp.join(cls.ann_file_prefix, + 'proposal_normalized_list.txt') + cls.rawvideo_test_anno_json = osp.join(cls.ann_file_prefix, + 'rawvideo_test_anno.json') + cls.rawvideo_test_anno_txt = osp.join(cls.ann_file_prefix, + 'rawvideo_test_anno.txt') + cls.video_ann_file = osp.join(cls.ann_file_prefix, + 'video_test_list.txt') + cls.video_ann_file_multi_label = osp.join( + cls.ann_file_prefix, 
'video_test_list_multi_label.txt') + cls.video_text_ann_file = osp.join(cls.ann_file_prefix, + 'video_text_test_list.json') + cls.pose_ann_file = osp.join(cls.ann_file_prefix, 'sample.pkl') + + # pipeline configuration + cls.action_pipeline = [] + cls.audio_feature_pipeline = [ + dict(type='LoadAudioFeature'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='AudioFeatureSelector') + ] + cls.audio_pipeline = [ + dict(type='AudioDecodeInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='AudioDecode') + ] + cls.frame_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='RawFrameDecode', io_backend='disk') + ] + cls.proposal_pipeline = [ + dict( + type='SampleProposalFrames', + clip_len=1, + body_segments=5, + aug_segments=(2, 2), + aug_ratio=0.5), + dict(type='RawFrameDecode', io_backend='disk') + ] + cls.proposal_test_pipeline = [ + dict( + type='SampleProposalFrames', + clip_len=1, + body_segments=5, + aug_segments=(2, 2), + aug_ratio=0.5, + mode='test'), + dict(type='RawFrameDecode', io_backend='disk') + ] + cls.proposal_train_cfg = ConfigDict( + dict( + ssn=dict( + assigner=dict( + positive_iou_threshold=0.7, + background_iou_threshold=0.01, + incomplete_iou_threshold=0.5, + background_coverage_threshold=0.02, + incomplete_overlap_threshold=0.01), + sampler=dict( + num_per_video=8, + positive_ratio=1, + background_ratio=1, + incomplete_ratio=6, + add_gt_as_proposals=True), + loss_weight=dict( + comp_loss_weight=0.1, reg_loss_weight=0.1), + debug=False))) + cls.proposal_test_cfg = ConfigDict( + dict( + ssn=dict( + sampler=dict(test_interval=6, batch_size=16), + evaluater=dict( + top_k=2000, + nms=0.2, + softmax_before_filter=True, + cls_top_k=2)))) + cls.proposal_test_cfg_topall = ConfigDict( + dict( + ssn=dict( + sampler=dict(test_interval=6, batch_size=16), + evaluater=dict( + top_k=-1, + nms=0.2, + 
softmax_before_filter=True, + cls_top_k=2)))) + cls.rawvideo_pipeline = [] + cls.video_pipeline = [ + dict(type='OpenCVInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='OpenCVDecode') + ] + + cls.video_text_pipeline = [ + dict(type='OpenCVInit'), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1), + dict(type='OpenCVDecode'), + dict(type='CLIPTokenize') + ] + + cls.hvu_categories = [ + 'action', 'attribute', 'concept', 'event', 'object', 'scene' + ] + cls.hvu_category_nums = [739, 117, 291, 69, 1679, 248] + cls.hvu_categories_for_eval = ['action', 'scene', 'object'] + cls.hvu_category_nums_for_eval = [3, 3, 3] + + cls.filename_tmpl = 'img_{:05d}.jpg' diff --git a/tests/datasets/test_ava_dataset.py b/tests/datasets/test_ava_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a327a031e54bf2408b78bc04dbae38de4a02c7fe --- /dev/null +++ b/tests/datasets/test_ava_dataset.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp + +import mmengine +import numpy as np +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.datasets import AVADataset, AVAKineticsDataset +from mmaction.utils import register_all_modules + + +class TestAVADataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), './../data', 'ava_dataset')) + cls.label_file = osp.join(cls.data_prefix, 'action_list.txt') + cls.ann_file = osp.join(cls.data_prefix, 'ava_sample.csv') + cls.exclude_file = osp.join(cls.data_prefix, + 'ava_excluded_timestamps_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'ava_proposals_sample.pkl') + cls.pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2) + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_ava_dataset(self): + register_all_modules() + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + # custom classes + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + label_file=self.label_file, + custom_classes=[17, 79], + num_classes=3, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # ava_infos = ava_dataset.video_infos + target_labels = np.array([1, 2]) + labels = np.zeros([3]) + labels[target_labels] = 1. + target_labels = labels[None, ...] 
+ + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + del ava_dataset + + def test_ava_pipeline(self): + register_all_modules() + target_keys = [ + 'frame_dir', 'video_id', 'timestamp', 'img_key', 'shot_info', + 'fps', 'filename_tmpl', 'modality', 'start_index', + 'timestamp_start', 'timestamp_end', 'proposals', 'scores', + 'frame_inds', 'clip_len', 'frame_interval', 'gt_labels', + 'gt_bboxes', 'entity_ids' + ] + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + self.exclude_file, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 1 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + assert_array_equal(result['proposals'], + np.array([[0.011, 0.157, 0.655, 0.983]])) + assert_array_equal(result['scores'], np.array([0.998163])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + None, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # Try to get a sample + result = ava_dataset[0] + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 1 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + + +class TestMultiSportsDataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = 
osp.normpath( + osp.join( + osp.dirname(__file__), './../data', 'multisports_dataset')) + cls.ann_file = osp.join(cls.data_prefix, 'multisports_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'multisports_proposals_sample.pkl') + cls.pipeline = [ + dict(type='DecordInit'), + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='DecordDecode') + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_multisports_dataset(self): + register_all_modules() + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + del ava_dataset + + def test_ava_pipeline(self): + register_all_modules() + target_keys = [ + 'filename', 'video_id', 'timestamp', 'img_key', 'shot_info', 'fps', + 'filename_tmpl', 'modality', 'start_index', 'timestamp_start', + 'timestamp_end', 'proposals', 'scores', 'frame_inds', 'clip_len', + 'frame_interval', 'gt_labels', 'gt_bboxes', 'entity_ids' + ] + + def mock_video_reader(filename): + from unittest.mock import MagicMock + container = MagicMock() + container.__len__.return_value = 100 + container.get_avg_fps.return_value = 24 + frame_batch = MagicMock() + frame_batch.asnumpy.return_value = np.zeros((32, 720, 1280, 3)) + container.get_batch.return_value = frame_batch + return container + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + + # Mock a decord Container + ava_dataset.pipeline.transforms[ + 0]._get_video_reader = 
mock_video_reader + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['modality'] == 'RGB' + assert result['fps'] == 1 + assert result['start_index'] == 0 + + h, w = result['imgs'][0].shape[:2] + scale_factor = np.array([w, h, w, h]) + gt_bboxes = np.array([[0.71097612, 0.44144461, 0.79291363, 0.80873633], + [0.19915699, 0.40121613, 0.29834411, + 0.79667876]]) + assert_array_almost_equal( + result['proposals'], gt_bboxes * scale_factor, decimal=4) + assert_array_almost_equal(result['scores'], + np.array([0.994165, 0.9902001])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = AVADataset( + self.ann_file, + self.pipeline, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file, + use_frames=False, + timestamp_start=1, + start_index=0, + multilabel=False, + fps=1) + # Mock a decord Container + ava_dataset.pipeline.transforms[ + 0]._get_video_reader = mock_video_reader + # Try to get a sample + result = ava_dataset[0] + assert result['modality'] == 'RGB' + assert result['fps'] == 1 + assert result['start_index'] == 0 + + +class TestAVAKineticsDataset: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), './../data', 'ava_dataset')) + cls.label_file = osp.join(cls.data_prefix, 'action_list.txt') + cls.ann_file = osp.join(cls.data_prefix, 'ava_sample.csv') + cls.exclude_file = osp.join(cls.data_prefix, + 'ava_excluded_timestamps_sample.csv') + cls.proposal_file = osp.join(cls.data_prefix, + 'ava_proposals_sample.pkl') + cls.pipeline = [ + dict(dict(type='SampleAVAFrames', clip_len=32, frame_interval=2)) + ] + cls.proposal = mmengine.load(cls.proposal_file) + + def test_ava_kinetics_dataset(self): + register_all_modules() + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + self.label_file, + data_prefix={'img': 
self.data_prefix}, + proposal_file=self.proposal_file) + + # custom classes + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + label_file=self.label_file, + custom_classes=[17, 79], + num_classes=3, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # ava_infos = ava_dataset.video_infos + target_labels = np.array([1, 2]) + labels = np.zeros([3]) + labels[target_labels] = 1. + target_labels = labels[None, ...] + + ava_dataset = AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + ava_dataset = AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + + del ava_dataset + + def test_ava_kinetics_pipeline(self): + register_all_modules() + target_keys = [ + 'frame_dir', 'video_id', 'timestamp', 'img_key', 'shot_info', + 'fps', 'filename_tmpl', 'modality', 'start_index', + 'timestamp_start', 'timestamp_end', 'proposals', 'scores', + 'frame_inds', 'clip_len', 'frame_interval', 'gt_labels', + 'gt_bboxes', 'entity_ids' + ] + + ava_dataset = AVAKineticsDataset( + self.ann_file, + self.exclude_file, + self.pipeline, + self.label_file, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + result = ava_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] == 0 + assert result['timestamp_start'] == 900 + assert result['timestamp_end'] == 1800 + assert_array_equal(result['proposals'], + np.array([[0.011, 0.157, 0.655, 0.983]])) + assert_array_equal(result['scores'], np.array([0.998163])) + + assert result['clip_len'] == 32 + assert result['frame_interval'] == 2 + assert len(result['frame_inds']) == 32 + + ava_dataset = 
AVAKineticsDataset( + self.ann_file, + None, + self.pipeline, + self.label_file, + test_mode=True, + data_prefix={'img': self.data_prefix}, + proposal_file=self.proposal_file) + # Try to get a sample + result = ava_dataset[0] + assert result['filename_tmpl'] == 'img_{:05}.jpg' + assert result['modality'] == 'RGB' + assert result['start_index'] >= 0 + assert result['timestamp_start'] > 0 + assert result['timestamp_end'] > result['timestamp_start'] diff --git a/tests/datasets/test_pose_dataset.py b/tests/datasets/test_pose_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..383d20acd1a6094c401f3cdbdc00a72b8be902bc --- /dev/null +++ b/tests/datasets/test_pose_dataset.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pytest + +from mmaction.datasets import PoseDataset +from .base import BaseTestDataset + + +class TestPoseDataset(BaseTestDataset): + + def test_pose_dataset(self): + ann_file = self.pose_ann_file + data_prefix = dict(video='root') + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + box_thr=0.5, + data_prefix=data_prefix) + assert len(dataset) == 100 + item = dataset[0] + assert item['frame_dir'].startswith(data_prefix['video']) + + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + valid_ratio=0.2, + box_thr=0.9) + assert len(dataset) == 84 + for item in dataset: + assert np.all(item['box_score'][item['anno_inds']] >= 0.9) + assert item['valid'][0.9] / item['total_frames'] >= 0.2 + + dataset = PoseDataset( + ann_file=ann_file, + pipeline=[], + split='train', + valid_ratio=0.3, + box_thr=0.7) + assert len(dataset) == 87 + for item in dataset: + assert np.all(item['box_score'][item['anno_inds']] >= 0.7) + assert item['valid'][0.7] / item['total_frames'] >= 0.3 + + with pytest.raises(AssertionError): + dataset = PoseDataset( + ann_file=ann_file, pipeline=[], valid_ratio=0.2, box_thr=0.55) diff --git 
a/tests/datasets/test_rawframe_dataset.py b/tests/datasets/test_rawframe_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..63a5338129121d733535fb7b52fdd9a1867e0653 --- /dev/null +++ b/tests/datasets/test_rawframe_dataset.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import RawframeDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestRawframDataset(BaseTestDataset): + + def test_rawframe_dataset(self): + rawframe_dataset = RawframeDataset(self.frame_ann_file, + self.frame_pipeline, + {'img': self.data_prefix}) + assert rawframe_dataset.start_index == 1 + + def test_rawframe_dataset_with_offset(self): + register_all_modules() + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True) + assert rawframe_dataset.start_index == 1 + + def test_rawframe_dataset_multi_label(self): + register_all_modules() + rawframe_dataset = RawframeDataset( + self.frame_ann_file_multi_label, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=100) + assert rawframe_dataset.start_index == 1 + + def test_dataset_realpath(self): + register_all_modules() + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline, + {'img': '.'}) + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline, + {'img': 's3://good'}) + assert dataset.data_prefix == {'img': 's3://good'} + + dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline) + + def test_rawframe_pipeline(self): + target_keys = [ + 'frame_dir', 'total_frames', 'label', 'filename_tmpl', + 'start_index', 'modality' + ] + + # RawframeDataset not in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + test_mode=False) + result = rawframe_dataset[0] + 
assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset in multi-class tasks + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=400, + test_mode=False) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True, + num_classes=400, + test_mode=False) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys + ['offset']) + + # RawframeDataset in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset in multi-class tasks in test mode + rawframe_dataset = RawframeDataset( + self.frame_ann_file, + self.frame_pipeline, {'img': self.data_prefix}, + multi_class=True, + num_classes=400, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, {'img': self.data_prefix}, + with_offset=True, + num_classes=400, + test_mode=True) + result = rawframe_dataset[0] + assert assert_dict_has_keys(result, target_keys + ['offset']) diff --git a/tests/datasets/test_repeataug_dataset.py b/tests/datasets/test_repeataug_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..184092d0c12be3406551d319bc545f8d9c477148 --- /dev/null +++ b/tests/datasets/test_repeataug_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import RepeatAugDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + with pytest.raises(AssertionError): + # Currently only support decord backend + video_dataset = RepeatAugDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_dataset_multi_label(self): + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + video_dataset = RepeatAugDataset( + self.video_ann_file_multi_label, + video_pipeline, + data_prefix={'video': self.data_prefix}, + multi_class=True, + num_classes=100) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + video_pipeline = [ + dict(type='DecordInit'), + dict( + type='SampleFrames', clip_len=4, frame_interval=2, + num_clips=1), + dict(type='DecordDecode') + ] + target_keys = ['filename', 'label', 'start_index', 'modality'] + + # RepeatAugDataset not in test mode + video_dataset = RepeatAugDataset( + self.video_ann_file, + video_pipeline, + data_prefix={'video': self.data_prefix}) + result = video_dataset[0] + assert isinstance(result, (list, tuple)) + assert 
assert_dict_has_keys(result[0], target_keys) diff --git a/tests/datasets/test_video_dataset.py b/tests/datasets/test_video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfe9dd800a92bab44cbe9d6a57e8108423ce644 --- /dev/null +++ b/tests/datasets/test_video_dataset.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import VideoDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_dataset_multi_label(self): + video_dataset = VideoDataset( + self.video_ann_file_multi_label, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + multi_class=True, + num_classes=100) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + target_keys = ['filename', 'label', 'start_index', 'modality'] + + # VideoDataset not in test mode + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=False) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # VideoDataset in test mode + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=True) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) diff --git a/tests/datasets/test_video_text_dataset.py b/tests/datasets/test_video_text_dataset.py new file 
mode 100644 index 0000000000000000000000000000000000000000..49f150b867cc5e64b3efe1d6cf7b1914659a0a27 --- /dev/null +++ b/tests/datasets/test_video_text_dataset.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.testing import assert_dict_has_keys + +from mmaction.datasets import VideoTextDataset +from mmaction.utils import register_all_modules +from .base import BaseTestDataset + + +class TestVideoTextDataset(BaseTestDataset): + register_all_modules() + + def test_video_dataset(self): + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + start_index=3) + assert len(video_dataset) == 2 + assert video_dataset.start_index == 3 + + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}) + assert video_dataset.start_index == 0 + + def test_video_pipeline(self): + target_keys = ['filename', 'text', 'start_index', 'modality', 'imgs'] + + # VideoTextDataset not in test mode + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=False) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) + + # VideoTextDataset in test mode + video_dataset = VideoTextDataset( + self.video_text_ann_file, + self.video_text_pipeline, + data_prefix={'video': self.data_prefix}, + test_mode=True) + result = video_dataset[0] + assert assert_dict_has_keys(result, target_keys) diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py new file mode 100644 index 0000000000000000000000000000000000000000..746b1d4ae83775e645f011ec314e2442c0428360 --- /dev/null +++ b/tests/datasets/transforms/test_formating.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import unittest + +import numpy as np +import pytest +import torch +from mmengine.structures import InstanceData +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_equal + +from mmaction.datasets.transforms import (FormatAudioShape, FormatGCNInput, + FormatShape, PackActionInputs, + Transpose) +from mmaction.registry import TRANSFORMS +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + +register_all_modules() + + +class TestPackActionInputs(unittest.TestCase): + + def test_transform(self): + # none input + with self.assertRaises(ValueError): + results = PackActionInputs()(dict()) + + # keypoint input + results = dict(keypoint=np.random.randn(2, 300, 17, 3), label=1) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 300, 17, 3)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # heatmap_imgs input + results = dict(heatmap_imgs=np.random.randn(2, 17, 56, 56), label=1) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 17, 56, 56)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # audios input + results = dict(audios=np.random.randn(3, 1, 128, 80), label=[1]) + transform = PackActionInputs() + results = transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertEqual(results['inputs'].shape, (3, 1, 128, 80)) + self.assertIsInstance(results['inputs'], torch.Tensor) + + # text input + results = dict(text=np.random.randn(77)) + transform = PackActionInputs() + results = 
transform(results) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertEqual(results['inputs'].shape, (77, )) + self.assertIsInstance(results['inputs'], torch.Tensor) + + # imgs input with label + data = dict( + imgs=np.random.randn(2, 256, 256, 3), + label=[1], + filename='test.txt', + original_shape=(256, 256, 3), + img_shape=(256, 256, 3), + flip_direction='vertical') + + transform = PackActionInputs() + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertEqual(results['data_samples'].img_shape, (256, 256, 3)) + self.assertEqual(results['data_samples'].gt_label, + torch.LongTensor([1])) + + # Test grayscale image + data['imgs'] = data['imgs'].mean(-1) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (2, 256, 256)) + + # imgs input with gt_bboxes + data = dict( + imgs=np.random.randn(256, 256, 3), + gt_bboxes=np.array([[0, 0, 340, 224]]), + gt_labels=[1], + proposals=np.array([[0, 0, 340, 224]]), + filename='test.txt') + + transform = PackActionInputs() + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].proposals, InstanceData) + + # imgs and text input + data = dict( + imgs=np.random.randn(2, 256, 256, 3), text=np.random.randn(77)) + + transform = PackActionInputs(collect_keys=('imgs', 'text')) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + 
self.assertIn('data_samples', results) + self.assertIsInstance(results['inputs'], dict) + self.assertEqual(results['inputs']['imgs'].shape, (2, 256, 256, 3)) + self.assertEqual(results['inputs']['text'].shape, (77, )) + + def test_repr(self): + cfg = dict( + type='PackActionInputs', meta_keys=['flip_direction', 'img_shape']) + transform = TRANSFORMS.build(cfg) + self.assertEqual( + repr(transform), 'PackActionInputs(collect_keys=None, ' + "meta_keys=['flip_direction', 'img_shape'])") + + +class TestPackLocalizationInputs(unittest.TestCase): + + def test_transform(self): + # raw_feature input + data = dict( + raw_feature=np.random.randn(400, 5), + gt_bbox=np.array([[0.1, 0.3], [0.375, 0.625]]), + filename='test.txt') + + cfg = dict(type='PackLocalizationInputs', keys=('gt_bbox', )) + transform = TRANSFORMS.build(cfg) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + + del data['raw_feature'] + with self.assertRaises(ValueError): + transform(copy.deepcopy(data)) + + # bsp_feature input + data['bsp_feature'] = np.random.randn(100, 32) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], ActionDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + + def test_repr(self): + cfg = dict( + type='PackLocalizationInputs', + meta_keys=['video_name', 'feature_frame']) + transform = TRANSFORMS.build(cfg) + self.assertEqual( + repr(transform), + "PackLocalizationInputs(meta_keys=['video_name', 'feature_frame'])" + ) + + +def test_transpose(): + results = dict(imgs=np.random.randn(256, 256, 3)) + keys = 
['imgs'] + order = [2, 0, 1] + transpose = Transpose(keys, order) + results = transpose(results) + assert results['imgs'].shape == (3, 256, 256) + assert repr(transpose) == transpose.__class__.__name__ + \ + f'(keys={keys}, order={order})' + + +def test_format_shape(): + with pytest.raises(ValueError): + # invalid input format + FormatShape('NHWC') + + # 'NCHW' input format (RGB Modality) + results = dict( + imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) + format_shape = FormatShape('NCHW') + assert format_shape(results)['input_shape'] == (3, 3, 224, 224) + + # `NCHW` input format (Flow Modality) + results = dict( + imgs=np.random.randn(3, 224, 224, 2), + num_clips=1, + clip_len=3, + modality='Flow') + format_shape = FormatShape('NCHW') + assert format_shape(results)['input_shape'] == (1, 6, 224, 224) + + # `NCTHW` input format with num_clips=1, clip_len=3 + results = dict( + imgs=np.random.randn(3, 224, 224, 3), num_clips=1, clip_len=3) + format_shape = FormatShape('NCTHW') + assert format_shape(results)['input_shape'] == (1, 3, 3, 224, 224) + + # `NCTHW` input format with num_clips=2, clip_len=3 + results = dict( + imgs=np.random.randn(18, 224, 224, 3), num_clips=2, clip_len=3) + assert format_shape(results)['input_shape'] == (6, 3, 3, 224, 224) + target_keys = ['imgs', 'input_shape'] + assert assert_dict_has_keys(results, target_keys) + + # `NCTHW` input format with imgs and heatmap_imgs + results = dict( + imgs=np.random.randn(6, 224, 224, 3), + heatmap_imgs=np.random.randn(12, 17, 56, 56), + num_clips=2, + clip_len=dict(RGB=3, Pose=6)) + + results = format_shape(results) + assert results['input_shape'] == (2, 3, 3, 224, 224) + assert results['heatmap_input_shape'] == (2, 17, 6, 56, 56) + + assert repr(format_shape) == "FormatShape(input_format='NCTHW')" + + # `NCTHW_Heatmap` input format + results = dict( + imgs=np.random.randn(12, 17, 56, 56), num_clips=2, clip_len=6) + format_shape = FormatShape('NCTHW_Heatmap') + assert 
format_shape(results)['input_shape'] == (2, 17, 6, 56, 56) + + # `NPTCHW` input format + results = dict( + imgs=np.random.randn(72, 224, 224, 3), + num_clips=9, + clip_len=1, + num_proposals=8) + format_shape = FormatShape('NPTCHW') + assert format_shape(results)['input_shape'] == (8, 9, 3, 224, 224) + + +def test_format_audio_shape(): + with pytest.raises(ValueError): + # invalid input format + FormatAudioShape('XXXX') + + # `NCTF` input format + results = dict(audios=np.random.randn(3, 128, 8)) + format_shape = FormatAudioShape('NCTF') + assert format_shape(results)['input_shape'] == (3, 1, 128, 8) + assert repr(format_shape) == format_shape.__class__.__name__ + \ + "(input_format='NCTF')" + + +def test_format_gcn_input(): + with pytest.raises(AssertionError): + FormatGCNInput(mode='invalid') + + results = dict( + keypoint=np.random.randn(2, 10, 17, 2), + keypoint_score=np.random.randn(2, 10, 17)) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 17, 3) + assert repr(format_shape) == 'FormatGCNInput(num_person=2, mode=zero)' + + results = dict(keypoint=np.random.randn(2, 40, 25, 3), num_clips=4) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (4, 2, 10, 25, 3) + + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + format_shape = FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) + assert_array_equal(results['keypoint'][:, 1], np.zeros((1, 10, 25, 3))) + + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + format_shape = FormatGCNInput(num_person=2, mode='loop') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) + assert_array_equal(results['keypoint'][:, 1], results['keypoint'][:, 0]) + + results = dict(keypoint=np.random.randn(3, 10, 25, 3)) + format_shape = 
FormatGCNInput(num_person=2, mode='zero') + results = format_shape(results) + assert results['keypoint'].shape == (1, 2, 10, 25, 3) diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e13687709b0c6213a5b87389830788d96c2e90 --- /dev/null +++ b/tests/datasets/transforms/test_loading.py @@ -0,0 +1,748 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import platform + +import mmcv +import numpy as np +import pytest +import torch +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import (DecordDecode, DecordInit, + GenerateLocalizationLabels, + LoadAudioFeature, LoadHVULabel, + LoadLocalizationFeature, + LoadProposals, LoadRGBFromFile, + OpenCVDecode, OpenCVInit, PIMSDecode, + PIMSInit, PyAVDecode, + PyAVDecodeMotionVector, PyAVInit) + +from mmaction.datasets.transforms import RawFrameDecode # isort:skip + + +class BaseTestLoading: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data')) + cls.img_path = osp.join(cls.data_prefix, 'test.jpg') + cls.video_path = osp.join(cls.data_prefix, 'test.mp4') + cls.wav_path = osp.join(cls.data_prefix, 'test.wav') + cls.audio_spec_path = osp.join(cls.data_prefix, 'test.npy') + cls.img_dir = osp.join(cls.data_prefix, 'imgs') + cls.raw_feature_dir = osp.join(cls.data_prefix, 'activitynet_features') + cls.bsp_feature_dir = osp.join(cls.data_prefix, 'bsp_features') + cls.proposals_dir = osp.join(cls.data_prefix, 'proposals') + + cls.total_frames = 5 + cls.filename_tmpl = 'img_{:05}.jpg' + cls.flow_filename_tmpl = '{}_{:05d}.jpg' + video_total_frames = len(mmcv.VideoReader(cls.video_path)) + cls.audio_total_frames = video_total_frames + + cls.video_results = dict( + filename=cls.video_path, + label=1, + 
total_frames=video_total_frames, + start_index=0) + cls.audio_results = dict( + audios=np.random.randn(1280, ), + audio_path=cls.wav_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.audio_feature_results = dict( + audios=np.random.randn(128, 80), + audio_path=cls.audio_spec_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + modality='RGB', + offset=0, + label=1) + cls.flow_frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.flow_filename_tmpl, + modality='Flow', + offset=0, + label=1) + cls.action_results = dict( + video_name='v_test1', + data_prefix=cls.raw_feature_dir, + temporal_scale=5, + boundary_ratio=0.1, + duration_second=10, + duration_frame=10, + feature_frame=8, + annotations=[{ + 'segment': [3.0, 5.0], + 'label': 'Rock climbing' + }]) + cls.action_results['feature_path'] = osp.join(cls.raw_feature_dir, + 'v_test1.csv') + + cls.ava_results = dict( + fps=30, timestamp=902, timestamp_start=840, shot_info=(0, 27000)) + + cls.hvu_label_example1 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[0], object=[2, 3], scene=[0, 1])) + cls.hvu_label_example2 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[1], scene=[1, 2], concept=[1])) + + +class TestDecode(BaseTestLoading): + + def test_pyav_init(self): + target_keys = ['video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + assert assert_dict_has_keys(pyav_init_result, target_keys) + assert pyav_init_result['total_frames'] == 300 + assert repr( + pyav_init) == f'{pyav_init.__class__.__name__}(io_backend=disk)' + + def test_pyav_decode(self): + 
target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test PyAV with 2 dim input and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 2)[:, np.newaxis] + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert repr(pyav_decode) == (f'{pyav_decode.__class__.__name__}(' + f'multi_thread={False}, mode=accurate)') + + # test PyAV with 1 dim input and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with multi thread and start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True) + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + 
video_result['frame_inds']), 256, 340, 3) + assert repr(pyav_decode) == (f'{pyav_decode.__class__.__name__}(' + f'multi_thread={True}, mode=accurate)') + + # test PyAV with 2 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, + 2)[:, np.newaxis] + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test PyAV with 1 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode() + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with multi thread + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True) + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # PyAV with efficient mode + video_result = 
copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 5) + pyav_init = PyAVInit() + pyav_init_result = pyav_init(video_result) + video_result['video_reader'] = pyav_init_result['video_reader'] + + pyav_decode = PyAVDecode(multi_thread=True, mode='efficient') + pyav_decode_result = pyav_decode(video_result) + assert assert_dict_has_keys(pyav_decode_result, target_keys) + assert pyav_decode_result['original_shape'] == (256, 340) + assert np.shape(pyav_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert pyav_decode_result['video_reader'] is None + + assert (repr(pyav_decode) == pyav_decode.__class__.__name__ + + f'(multi_thread={True}, mode=efficient)') + + def test_pims_init(self): + target_keys = ['video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + pims_init = PIMSInit() + pims_init_result = pims_init(video_result) + assert assert_dict_has_keys(pims_init_result, target_keys) + assert pims_init_result['total_frames'] == 300 + + pims_init = PIMSInit(mode='efficient') + pims_init_result = pims_init(video_result) + assert assert_dict_has_keys(pims_init_result, target_keys) + assert pims_init_result['total_frames'] == 300 + + assert repr(pims_init) == (f'{pims_init.__class__.__name__}' + f'(io_backend=disk, mode=efficient)') + + def test_pims_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 2)[:, np.newaxis] + pims_init = PIMSInit() + pims_init_result = pims_init(video_result) + + pims_decode = PIMSDecode() + pims_decode_result = pims_decode(pims_init_result) + assert assert_dict_has_keys(pims_decode_result, target_keys) + assert pims_decode_result['original_shape'] == (256, 340) + assert np.shape(pims_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + def test_decord_init(self): + target_keys = 
['video_reader', 'total_frames', 'avg_fps'] + video_result = copy.deepcopy(self.video_results) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + assert assert_dict_has_keys(decord_init_result, target_keys) + assert decord_init_result['total_frames'] == len( + decord_init_result['video_reader']) + assert decord_init_result['avg_fps'] == 30 + + assert repr(decord_init) == (f'{decord_init.__class__.__name__}(' + f'io_backend=disk, ' + f'num_threads=1)') + + def test_decord_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test Decord with 2 dim input using accurate mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, + 3)[:, np.newaxis] + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode() + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 1 dim input using accurate mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 3) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode() + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 2 dim input using efficient mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = 
np.arange(0, self.total_frames, + 3)[:, np.newaxis] + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode(mode='efficient') + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test Decord with 1 dim input using efficient mode + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 3) + decord_init = DecordInit() + decord_init_result = decord_init(video_result) + video_result['video_reader'] = decord_init_result['video_reader'] + + decord_decode = DecordDecode(mode='efficient') + decord_decode_result = decord_decode(video_result) + assert assert_dict_has_keys(decord_decode_result, target_keys) + assert decord_decode_result['original_shape'] == (256, 340) + assert np.shape(decord_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + assert repr(decord_decode) == (f'{decord_decode.__class__.__name__}(' + f'mode=efficient)') + + def test_opencv_init(self): + target_keys = ['new_path', 'video_reader', 'total_frames'] + video_result = copy.deepcopy(self.video_results) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + assert assert_dict_has_keys(opencv_init_result, target_keys) + assert opencv_init_result['total_frames'] == len( + opencv_init_result['video_reader']) + assert repr(opencv_init) == (f'{opencv_init.__class__.__name__}(' + f'io_backend=disk)') + + def test_opencv_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape'] + + # test OpenCV with 2 dim input when start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, 
self.total_frames, + 2)[:, np.newaxis] + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test OpenCV with 2 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, + 2)[:, np.newaxis] + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + # test OpenCV with 1 dim input when start_index = 0 + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(0, self.total_frames, 3) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + # test OpenCV with 1 dim input + video_result = copy.deepcopy(self.video_results) + video_result['frame_inds'] = np.arange(1, self.total_frames, 3) + opencv_init = OpenCVInit() + opencv_init_result = opencv_init(video_result) + video_result['video_reader'] = opencv_init_result['video_reader'] + + opencv_decode = OpenCVDecode() + opencv_decode_result = opencv_decode(video_result) + assert assert_dict_has_keys(opencv_decode_result, target_keys) + assert opencv_decode_result['original_shape'] == (256, 340) + assert 
np.shape(opencv_decode_result['imgs']) == (len( + video_result['frame_inds']), 256, 340, 3) + + def test_rawframe_decode(self): + target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality'] + + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2)[:, + np.newaxis] + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + + inputs['gt_bboxes'] = np.array([[0, 0, 1, 1]]) + inputs['proposals'] = np.array([[0, 0, 1, 1]]) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2)[:, + np.newaxis] + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input when start_index = 0 + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 5) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + 
inputs['frame_inds'] = np.arange(1, self.total_frames, 5) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input for flow images + inputs = copy.deepcopy(self.flow_frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 2) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) + assert results['original_shape'] == (240, 320) + + # test frame selector with 1 dim input for flow images + inputs = 
copy.deepcopy(self.flow_frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 2) + frame_selector = RawFrameDecode(io_backend='disk') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240, + 320, 2) + assert results['original_shape'] == (240, 320) + + return + # cannot install turbojpeg for CI + if platform.system() != 'Windows': + # test frame selector in turbojpeg decoding backend + # when start_index = 0 + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(0, self.total_frames, 5) + # since the test images start with index 1, we plus 1 to frame_inds + # in order to pass the CI + inputs['frame_inds'] = inputs['frame_inds'] + 1 + frame_selector = RawFrameDecode( + io_backend='disk', decoding_backend='turbojpeg') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), + 240, 320, 3) + assert results['original_shape'] == (240, 320) + + # test frame selector in turbojpeg decoding backend + inputs = copy.deepcopy(self.frame_results) + inputs['frame_inds'] = np.arange(1, self.total_frames, 5) + frame_selector = RawFrameDecode( + io_backend='disk', decoding_backend='turbojpeg') + results = frame_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert np.shape(results['imgs']) == (len(inputs['frame_inds']), + 240, 320, 3) + assert results['original_shape'] == (240, 320) + assert repr(frame_selector) == ( + f'{frame_selector.__class__.__name__}(io_backend=disk, ' + f'decoding_backend=turbojpeg)') + + def test_pyav_decode_motion_vector(self): + pyav_init = PyAVInit() + pyav = PyAVDecodeMotionVector() + + # test pyav with 2-dim input + results = { + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1)[:, np.newaxis] + } + results = pyav_init(results) + results = pyav(results) + target_keys = 
['motion_vectors'] + assert assert_dict_has_keys(results, target_keys) + + # test pyav with 1 dim input + results = { + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1) + } + pyav_init = PyAVInit() + results = pyav_init(results) + pyav = PyAVDecodeMotionVector() + results = pyav(results) + + assert assert_dict_has_keys(results, target_keys) + + +class TestLoad(BaseTestLoading): + + def test_load_hvu_label(self): + hvu_label_example1 = copy.deepcopy(self.hvu_label_example1) + hvu_label_example2 = copy.deepcopy(self.hvu_label_example2) + categories = hvu_label_example1['categories'] + category_nums = hvu_label_example1['category_nums'] + num_tags = sum(category_nums) + num_categories = len(categories) + + loader = LoadHVULabel() + assert repr(loader) == (f'{loader.__class__.__name__}(' + f'hvu_initialized={False})') + + result1 = loader(hvu_label_example1) + label1 = torch.zeros(num_tags) + mask1 = torch.zeros(num_tags) + category_mask1 = torch.zeros(num_categories) + + assert repr(loader) == (f'{loader.__class__.__name__}(' + f'hvu_initialized={True})') + + label1[[0, 4, 5, 7, 8]] = 1. + mask1[:10] = 1. + category_mask1[:3] = 1. + + assert torch.all(torch.eq(label1, result1['label'])) + assert torch.all(torch.eq(mask1, result1['mask'])) + assert torch.all(torch.eq(category_mask1, result1['category_mask'])) + + result2 = loader(hvu_label_example2) + label2 = torch.zeros(num_tags) + mask2 = torch.zeros(num_tags) + category_mask2 = torch.zeros(num_categories) + + label2[[1, 8, 9, 11]] = 1. + mask2[:2] = 1. + mask2[7:] = 1. + category_mask2[[0, 2, 3]] = 1. 
+ + assert torch.all(torch.eq(label2, result2['label'])) + assert torch.all(torch.eq(mask2, result2['mask'])) + assert torch.all(torch.eq(category_mask2, result2['category_mask'])) + + def test_load_localization_feature(self): + target_keys = ['raw_feature'] + + action_result = copy.deepcopy(self.action_results) + + # test error cases + with pytest.raises(TypeError): + load_localization_feature = LoadLocalizationFeature( + 'unsupport_ext') + + # test normal cases + load_localization_feature = LoadLocalizationFeature() + load_localization_feature_result = load_localization_feature( + action_result) + assert assert_dict_has_keys(load_localization_feature_result, + target_keys) + assert load_localization_feature_result['raw_feature'].shape == (400, + 5) + assert repr(load_localization_feature + ) == f'{load_localization_feature.__class__.__name__}' + + def test_load_proposals(self): + target_keys = [ + 'bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score', + 'reference_temporal_iou' + ] + + action_result = copy.deepcopy(self.action_results) + + # test error cases + with pytest.raises(NotImplementedError): + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir, + 'unsupport_ext') + + with pytest.raises(NotImplementedError): + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir, '.csv', + 'unsupport_ext') + + # test normal cases + load_proposals = LoadProposals(5, self.proposals_dir, + self.bsp_feature_dir) + load_proposals_result = load_proposals(action_result) + assert assert_dict_has_keys(load_proposals_result, target_keys) + assert load_proposals_result['bsp_feature'].shape[0] == 5 + assert load_proposals_result['tmin'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmin'], np.arange(0.1, 0.6, 0.1), decimal=4) + assert load_proposals_result['tmax'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmax'], np.arange(0.2, 0.7, 0.1), decimal=4) + assert 
load_proposals_result['tmin_score'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmin_score'], + np.arange(0.95, 0.90, -0.01), + decimal=4) + assert load_proposals_result['tmax_score'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['tmax_score'], + np.arange(0.96, 0.91, -0.01), + decimal=4) + assert load_proposals_result['reference_temporal_iou'].shape == (5, ) + assert_array_almost_equal( + load_proposals_result['reference_temporal_iou'], + np.arange(0.85, 0.80, -0.01), + decimal=4) + assert repr(load_proposals) == ( + f'{load_proposals.__class__.__name__}(' + f'top_k={5}, ' + f'pgm_proposals_dir={self.proposals_dir}, ' + f'pgm_features_dir={self.bsp_feature_dir}, ' + f'proposal_ext=.csv, ' + f'feature_ext=.npy)') + + def test_load_audio_feature(self): + target_keys = ['audios'] + inputs = copy.deepcopy(self.audio_feature_results) + load_audio_feature = LoadAudioFeature() + results = load_audio_feature(inputs) + assert assert_dict_has_keys(results, target_keys) + + # test when no audio feature file exists + inputs = copy.deepcopy(self.audio_feature_results) + inputs['audio_path'] = 'foo/foo/bar.npy' + load_audio_feature = LoadAudioFeature() + results = load_audio_feature(inputs) + assert results['audios'].shape == (640, 80) + assert assert_dict_has_keys(results, target_keys) + assert repr(load_audio_feature) == ( + f'{load_audio_feature.__class__.__name__}(' + f'pad_method=zero)') + + +class TestLocalization(BaseTestLoading): + + def test_generate_localization_label(self): + action_result = copy.deepcopy(self.action_results) + action_result['raw_feature'] = np.random.randn(400, 5) + + # test default setting + target_keys = ['gt_bbox'] + generate_localization_labels = GenerateLocalizationLabels() + generate_localization_labels_result = generate_localization_labels( + action_result) + assert assert_dict_has_keys(generate_localization_labels_result, + target_keys) + + assert_array_almost_equal( + 
generate_localization_labels_result['gt_bbox'], [[0.375, 0.625]], + decimal=4) + + +class TestLoadImageFromFile: + + def test_load_img(self): + data_prefix = osp.join(osp.dirname(__file__), '../../data') + + results = dict(img_path=osp.join(data_prefix, 'test.jpg')) + transform = LoadRGBFromFile() + results = transform(copy.deepcopy(results)) + assert results['img_path'] == osp.join(data_prefix, 'test.jpg') + assert results['img'].shape == (240, 320, 3) + assert results['img'].dtype == np.uint8 + assert results['img_shape'] == (240, 320) + assert results['ori_shape'] == (240, 320) + assert repr(transform) == transform.__class__.__name__ + \ + "(ignore_empty=False, to_float32=False, color_type='color', " + \ + "imdecode_backend='cv2', io_backend='disk')" + + # to_float32 + transform = LoadRGBFromFile(to_float32=True) + results = transform(copy.deepcopy(results)) + assert results['img'].dtype == np.float32 + + # test load empty + fake_img_path = osp.join(data_prefix, 'fake.jpg') + results['img_path'] = fake_img_path + transform = LoadRGBFromFile(ignore_empty=False) + with pytest.raises(FileNotFoundError): + transform(copy.deepcopy(results)) + transform = LoadRGBFromFile(ignore_empty=True) + assert transform(copy.deepcopy(results)) is None diff --git a/tests/datasets/transforms/test_pose_transforms.py b/tests/datasets/transforms/test_pose_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cde29fdce490ec10061674b42e08a5d7e8e91b --- /dev/null +++ b/tests/datasets/transforms/test_pose_transforms.py @@ -0,0 +1,697 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import copy as cp +import os.path as osp +from collections import defaultdict + +import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from mmaction.datasets.transforms import (DecompressPose, GeneratePoseTarget, + GenSkeFeat, JointToBone, + MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, + PoseCompact, PoseDecode, + PreNormalize2D, PreNormalize3D, + ToMotion, UniformSampleFrames) + + +class TestPoseTransforms: + + @staticmethod + def test_decompress_pose(): + + def get_mode(arr): + cnt = defaultdict(lambda: 0) + for num in arr: + cnt[num] += 1 + max_val = max(cnt.values()) + return [k for k in cnt if cnt[k] == max_val], max_val + + total_frames = 100 + img_shape = (224, 224) + frame_inds = np.random.choice(range(100), size=120) + frame_inds.sort() + anno_flag = np.random.random(120) > 0.1 + anno_inds = np.array([i for i, f in enumerate(anno_flag) if f]) + kp = np.random.random([120, 17, 3]) + results = dict( + frame_inds=frame_inds, + keypoint=kp, + total_frames=total_frames, + img_shape=img_shape) + + inp = cp.deepcopy(results) + + decompress_pose = DecompressPose(squeeze=True, max_person=100) + + assert str(decompress_pose) == ( + 'DecompressPose(squeeze=True, max_person=100)') + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person == get_mode(frame_inds)[1] + assert num_frame == len(set(frame_inds)) + + inp = cp.deepcopy(results) + decompress_pose = DecompressPose(squeeze=False, max_person=100) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] 
+ assert num_person == get_mode(frame_inds)[1] + assert num_frame == total_frames + + inp = cp.deepcopy(results) + inp['anno_inds'] = anno_inds + decompress_pose = DecompressPose(squeeze=True, max_person=100) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person == get_mode(frame_inds[anno_inds])[1] + assert num_frame == len(set(frame_inds[anno_inds])) + + inp = cp.deepcopy(results) + inp['anno_inds'] = anno_inds + decompress_pose = DecompressPose(squeeze=True, max_person=2) + return_results = decompress_pose(inp) + assert return_results['keypoint'].shape[:-1] == \ + return_results['keypoint_score'].shape + + num_person = return_results['keypoint'].shape[0] + num_frame = return_results['keypoint'].shape[1] + assert num_person <= 2 + assert num_frame == len(set(frame_inds[anno_inds])) + + @staticmethod + def test_generate_pose_target(): + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[1., 1., 1.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=True, + left_kp=(1, ), + right_kp=(2, ), + left_limb=(0, ), + right_limb=(1, ), + skeletons=()) + assert str(generate_pose_target) == ('GeneratePoseTarget(sigma=1, ' + 'use_score=True, with_kp=True, ' + 'with_limb=False, skeletons=(), ' + 'double=False, left_kp=(1,), ' + 'right_kp=(2,), left_limb=(0,), ' + 'right_limb=(1,), scaling=1.0)') + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + results = 
dict(img_shape=img_shape, keypoint=kp, modality='Pose') + + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) + assert_array_almost_equal(return_results['imgs'][0], + return_results['imgs'][1]) + + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + double=True, + left_limb=(0, ), + right_limb=(1, ), + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + imgs = return_results['imgs'] + assert imgs.shape == (16, 3, 64, 64) + assert_array_almost_equal(imgs[0], imgs[1]) + assert_array_almost_equal(imgs[:8, 2], imgs[8:, 2, :, ::-1]) + assert_array_almost_equal(imgs[:8, 0], imgs[8:, 1, :, ::-1]) + assert_array_almost_equal(imgs[:8, 1], imgs[8:, 0, :, ::-1]) + + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget( + sigma=1, with_kp=True, skeletons=()) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[24, 24], [40, 40], [24, 40]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + 
keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[124, 124], [140, 140], [124, 140]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert_array_almost_equal(return_results['imgs'], 0) + + img_shape = (64, 64) + kp = np.array([[[[124., 124.], [140., 140.], [124., 140.]]]]) + kpscore = np.array([[[0., 0., 0.]]]) + kp = np.concatenate([kp] * 8, axis=1) + kpscore = np.concatenate([kpscore] * 8, axis=1) + results = dict( + img_shape=img_shape, + keypoint=kp, + keypoint_score=kpscore, + modality='Pose') + generate_pose_target = GeneratePoseTarget( + sigma=1, + with_kp=False, + with_limb=True, + skeletons=((0, 1), (1, 2), (0, 2))) + return_results = generate_pose_target(results) + assert_array_almost_equal(return_results['imgs'], 0) + + @staticmethod + def test_pose_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + + pose_compact = PoseCompact( + padding=0, threshold=0, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert str(pose_compact) == ( + 'PoseCompact(padding=0, threshold=0, hw_ratio=None, ' + 'allow_imgpad=False)') + + pose_compact = PoseCompact( + padding=0.3, threshold=0, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = 
pose_compact(inp) + assert ret['img_shape'] == (100, 100) + + pose_compact = PoseCompact( + padding=0.3, threshold=0, hw_ratio=None, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + + pose_compact = PoseCompact( + padding=0, threshold=100, hw_ratio=None, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + + pose_compact = PoseCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + + @staticmethod + def test_pre_normalize3d(): + target_keys = ['keypoint', 'total_frames', 'body_center'] + + results = dict(keypoint=np.random.randn(2, 40, 25, 3), total_frames=40) + + pre_normalize3d = PreNormalize3D( + align_center=True, align_spine=True, align_shoulder=False) + + inp = copy.deepcopy(results) + ret1 = pre_normalize3d(inp) + + inp = copy.deepcopy(ret1) + ret2 = pre_normalize3d(inp) + + assert_array_equal(ret2['body_center'], np.zeros(3)) + assert_array_equal(ret1['keypoint'], ret2['keypoint']) + + pre_normalize3d = PreNormalize3D( + align_center=True, align_spine=False, align_shoulder=True) + + inp = copy.deepcopy(results) + ret3 = pre_normalize3d(inp) + + inp = copy.deepcopy(ret3) + ret4 = pre_normalize3d(inp) + + assert_array_equal(ret4['body_center'], np.zeros(3)) + assert_array_equal(ret3['keypoint'], ret4['keypoint']) + + assert assert_dict_has_keys(ret1, target_keys) + assert repr(pre_normalize3d) == 'PreNormalize3D(zaxis=[0, 1], ' \ + 'xaxis=[8, 4], align_center=True, ' \ + 'align_spine=False, ' \ + 'align_shoulder=True)' + + @staticmethod + def test_pre_normalize2d(): + + def check_pose_normalize(origin_kps, target_kps, h, w): + target_kps[..., 0] = target_kps[..., 0] * w / 2 + w / 2 + target_kps[..., 1] = target_kps[..., 1] * h / 2 + h / 2 + assert_array_almost_equal(origin_kps, target_kps, decimal=4) + + results = 
dict( + keypoint=np.random.randn(1, 40, 17, 2), img_shape=(480, 854)) + pre_normalize_2d = PreNormalize2D(img_shape=(1080, 1920)) + inp = copy.deepcopy(results) + ret1 = pre_normalize_2d(inp) + check_pose_normalize( + results['keypoint'], ret1['keypoint'], h=480, w=854) + + results = dict(keypoint=np.random.randn(1, 40, 17, 2)) + pre_normalize_2d = PreNormalize2D(img_shape=(1080, 1920)) + inp = copy.deepcopy(results) + ret2 = pre_normalize_2d(inp) + check_pose_normalize( + results['keypoint'], ret2['keypoint'], h=1080, w=1920) + + assert repr(pre_normalize_2d) == \ + 'PreNormalize2D(img_shape=(1080, 1920))' + + @staticmethod + def test_joint_to_bone(): + with pytest.raises(ValueError): + JointToBone(dataset='invalid') + + with pytest.raises(AssertionError): + JointToBone()(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + results = dict(keypoint=np.random.randn(2, 15, 25, 3)) + joint_to_bone = JointToBone(dataset='nturgb+d') + center_index = 20 + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :], + np.zeros((2, 15, 3))) + + results = dict(keypoint=np.random.randn(2, 15, 18, 3)) + joint_to_bone = JointToBone(dataset='openpose') + center_index = 0 + center_score = results['keypoint'][..., center_index, 2] + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :2], + np.zeros((2, 15, 2))) + assert_array_almost_equal(results['keypoint'][..., center_index, 2], + center_score) + + results = dict(keypoint=np.random.randn(2, 15, 17, 3)) + joint_to_bone = JointToBone(dataset='coco') + center_index = 0 + center_score = results['keypoint'][..., center_index, 2] + results = joint_to_bone(results) + assert_array_equal(results['keypoint'][..., center_index, :2], + np.zeros((2, 15, 2))) + assert_array_almost_equal(results['keypoint'][..., center_index, 2], + center_score) + + results = dict(keypoint=np.random.randn(2, 15, 17, 3)) + joint_to_bone = JointToBone(dataset='coco', target='bone') + 
results = joint_to_bone(results) + assert assert_dict_has_keys(results, ['keypoint', 'bone']) + assert repr(joint_to_bone) == 'JointToBone(dataset=coco, target=bone)' + + @staticmethod + def test_to_motion(): + with pytest.raises(AssertionError): + ToMotion()(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + with pytest.raises(KeyError): + ToMotion(source='j')(dict(keypoint=np.random.randn(2, 15, 25, 4))) + + results = dict(keypoint=np.random.randn(2, 15, 25, 3)) + to_motion = ToMotion() + results = to_motion(results) + assert_array_equal(results['motion'][:, -1, :, :], np.zeros( + (2, 25, 3))) + assert assert_dict_has_keys(results, ['keypoint', 'motion']) + assert repr(to_motion) == 'ToMotion(dataset=nturgb+d, ' \ + 'source=keypoint, target=motion)' + + @staticmethod + def test_merge_ske_feat(): + with pytest.raises(KeyError): + MergeSkeFeat()(dict(b=np.random.randn(2, 15, 25, 3))) + + results = dict( + j=np.random.randn(2, 10, 25, 3), b=np.random.randn(2, 10, 25, 3)) + merge_ske_feat = MergeSkeFeat(feat_list=['j', 'b']) + results = merge_ske_feat(results) + + assert assert_dict_has_keys(results, ['keypoint']) + assert results['keypoint'].shape == (2, 10, 25, 6) + assert repr(merge_ske_feat) == "MergeSkeFeat(feat_list=['j', 'b'], " \ + 'target=keypoint, axis=-1)' + + @staticmethod + def test_gen_ske_feat(): + results = dict(keypoint=np.random.randn(1, 10, 25, 3)) + + gen_ske_feat = GenSkeFeat(dataset='nturgb+d', feats=['j']) + inp = copy.deepcopy(results) + ret1 = gen_ske_feat(inp) + assert_array_equal(ret1['keypoint'], results['keypoint']) + + gen_ske_feat = GenSkeFeat( + dataset='nturgb+d', feats=['j', 'b', 'jm', 'bm']) + inp = copy.deepcopy(results) + ret2 = gen_ske_feat(inp) + assert ret2['keypoint'].shape == (1, 10, 25, 12) + + results = dict( + keypoint=np.random.randn(1, 10, 17, 2), + keypoint_score=np.random.randn(1, 10, 17)) + gen_ske_feat = GenSkeFeat(dataset='coco', feats=['j', 'b', 'jm', 'bm']) + results = gen_ske_feat(results) + assert 
results['keypoint'].shape == (1, 10, 17, 12) + assert assert_dict_has_keys(results, ['keypoint']) + assert not assert_dict_has_keys(results, ['j', 'b', 'jm', 'bm']) + assert repr(gen_ske_feat) == 'GenSkeFeat(dataset=coco, ' \ + "feats=['j', 'b', 'jm', 'bm'], axis=-1)" + + @staticmethod + def test_uniform_sample_frames(): + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + + assert repr(sampling) == ('UniformSampleFrames(clip_len=8, ' + 'num_clips=1, test_mode=True, seed=0)') + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 2, 4, 6, 8, 9, 11, 13])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 1, 2, 3, 4, 5, 6, 0])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=8, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 8 + assert 
len(sampling_results['frame_inds']) == 64 + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=4, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 4 + assert_array_equal( + sampling_results['frame_inds'], + np.array([ + 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, + 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 + ])) + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + @staticmethod + def test_pad_to(): + with pytest.raises(AssertionError): + PadTo(length=4, mode='invalid') + + results = dict( + keypoint=np.random.randn(2, 3, 17, 3), + total_frames=3, + start_index=0) + + inp = copy.deepcopy(results) + pad_to = PadTo(length=6, mode='loop') + ret1 = pad_to(inp) + kp = ret1['keypoint'] + 
assert_array_equal(kp[:, :3], kp[:, 3:]) + + inp = copy.deepcopy(results) + pad_to = PadTo(length=6, mode='zero') + ret2 = pad_to(inp) + kp = ret2['keypoint'] + assert ret2['total_frames'] == 6 + assert_array_equal(kp[:, 3:], np.zeros((2, 3, 17, 3))) + + @staticmethod + def test_pose_decode(): + kp = np.random.random([1, 16, 17, 2]) + kpscore = np.random.random([1, 16, 17]) + frame_inds = np.array([2, 4, 6, 8, 10]) + results = dict( + keypoint=kp, keypoint_score=kpscore, frame_inds=frame_inds) + pose_decode = PoseDecode() + assert repr(pose_decode) == 'PoseDecode()' + decode_results = pose_decode(results) + assert_array_almost_equal(decode_results['keypoint'], kp[:, + frame_inds]) + assert_array_almost_equal(decode_results['keypoint_score'], + kpscore[:, frame_inds]) + + results = dict(keypoint=kp, keypoint_score=kpscore, total_frames=16) + pose_decode = PoseDecode() + decode_results = pose_decode(results) + assert_array_almost_equal(decode_results['keypoint'], kp) + assert_array_almost_equal(decode_results['keypoint_score'], kpscore) + + @staticmethod + def test_mm_uniform_sample_frames(): + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=True, seed=0) + assert repr(sampling) == ('MMUniformSampleFrames(' + "clip_len={'RGB': 8, 'Pose': 32}, " + 'num_clips=1, test_mode=True, seed=0)') + + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert_array_equal(sampling_results['RGB_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + assert_array_equal( + sampling_results['Pose_inds'], + np.array([ + 0, 3, 5, 6, 9, 11, 13, 15, 17, 19, 21, 22, 24, 27, 28, 30, 32, + 34, 36, 39, 40, 43, 45, 46, 48, 51, 53, 55, 57, 58, 61, 62 + ])) + + results = dict(total_frames=64, 
modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True, + seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 10 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert len(sampling_results['RGB_inds']) == 80 + assert len(sampling_results['Pose_inds']) == 320 + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=False) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['RGB_inds']) == 8 + assert len(sampling_results['Pose_inds']) == 32 + + @staticmethod + def test_mm_decode(): + mm_decode = MMDecode() + + # Pose only test + pose_raw_results = dict( + modality=['Pose'], + Pose_inds=np.array([2, 4, 6, 8, 10]), + keypoint=np.random.random([1, 16, 17, 2]), + img_shape=(1080, 1920)) + rgb_raw_results = dict( + modality=['RGB'], + RGB_inds=np.array([2, 4, 6, 8, 10]), + frame_dir=osp.join(osp.dirname(__file__), '../../data/test')) + + # test pose w/o `keypoint_score` + mm_decode(copy.deepcopy(pose_raw_results)) + + # test pose with `keypoint_score` + pose_raw_results['keypoint_score'] = np.random.random([1, 16, 17]) + pose_results = mm_decode(copy.deepcopy(pose_raw_results)) + + # test rgb + rgb_results = mm_decode(copy.deepcopy(rgb_raw_results)) + + # test pose and rgb + pose_rgb_raw_results = { + **rgb_raw_results, + **pose_raw_results, 'modality': ['RGB', 'Pose'] + } + pose_rgb_results = mm_decode(copy.deepcopy(pose_rgb_raw_results)) + + assert_array_equal(pose_rgb_results['keypoint_score'], + pose_results['keypoint_score']) + scaled_keypoint = copy.deepcopy(pose_results['keypoint']) + 
oh, ow = pose_results['img_shape'] + nh, nw = pose_rgb_results['img_shape'] + scaled_keypoint[..., 0] *= (nw / ow) + scaled_keypoint[..., 1] *= (nh / oh) + assert_array_equal(pose_rgb_results['keypoint'], scaled_keypoint) + assert_array_equal(pose_rgb_results['imgs'], rgb_results['imgs']) + assert assert_dict_has_keys( + pose_rgb_results, ['filename', 'img_shape', 'original_shape']) + assert repr(mm_decode) == 'MMDecode(io_backend=disk)' + + @staticmethod + def test_mm_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + results['imgs'] = list(np.zeros([3, 100, 100, 3])) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert ret['imgs'][0].shape[:-1] == (80, 80) + assert str(pose_compact) == ( + 'MMCompact(padding=0, threshold=0, hw_ratio=(1, 1), ' + 'allow_imgpad=False)') + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + assert ret['imgs'][0].shape[:-1] == (104, 104) + + pose_compact = MMCompact( + padding=0, threshold=100, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + assert ret['imgs'][0].shape[:-1] == (80, 106) 
diff --git a/tests/datasets/transforms/test_processing.py b/tests/datasets/transforms/test_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..1c44715379fede42f8a7ade6392f2f2f41245943 --- /dev/null +++ b/tests/datasets/transforms/test_processing.py @@ -0,0 +1,872 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import mmcv +import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import (CenterCrop, ColorJitter, Flip, Fuse, + MultiScaleCrop, RandomCrop, + RandomResizedCrop, Resize, TenCrop, + ThreeCrop) + + +def check_crop(origin_imgs, result_imgs, result_bbox, num_crops=1): + """Check if the result_bbox is in correspond to result_imgs.""" + + def check_single_crop(origin_imgs, result_imgs, result_bbox): + result_img_shape = result_imgs[0].shape[:2] + crop_w = result_bbox[2] - result_bbox[0] + crop_h = result_bbox[3] - result_bbox[1] + crop_shape = (crop_h, crop_w) + if not crop_shape == result_img_shape: + return False + left, top, right, bottom = result_bbox + return np.array_equal( + np.array(origin_imgs)[:, top:bottom, left:right, :], + np.array(result_imgs)) + + if result_bbox.ndim == 1: + return check_single_crop(origin_imgs, result_imgs, result_bbox) + if result_bbox.ndim == 2: + num_batch = len(origin_imgs) + for i, bbox in enumerate(result_bbox): + if num_crops == 10: + if (i // num_batch) % 2 == 0: + flag = check_single_crop([origin_imgs[i % num_batch]], + [result_imgs[i]], bbox) + else: + flag = check_single_crop([origin_imgs[i % num_batch]], + [np.flip(result_imgs[i], axis=1)], + bbox) + else: + flag = check_single_crop([origin_imgs[i % num_batch]], + [result_imgs[i]], bbox) + if not flag: + return False + return True + else: + # bbox has a wrong dimension + return False + + +def check_flip(origin_imgs, result_imgs, flip_type): + """Check if the origin_imgs are flipped correctly into 
result_imgs in + different flip_types.""" + n, _, _, _ = np.shape(origin_imgs) + if flip_type == 'horizontal': + for i in range(n): + if np.any(result_imgs[i] != np.fliplr(origin_imgs[i])): + return False + else: + # yapf: disable + for i in range(n): + if np.any(result_imgs[i] != np.transpose(np.fliplr(np.transpose(origin_imgs[i], (1, 0, 2))), (1, 0, 2))): # noqa:E501 + return False + # yapf: enable + return True + + +class TestColor: + + @staticmethod + def test_color_jitter(): + imgs = list( + np.random.randint(0, 255, size=(3, 112, 112, 3), dtype=np.uint8)) + results = dict(imgs=imgs) + + color_jitter = ColorJitter() + assert color_jitter.brightness == (0.5, 1.5) + assert color_jitter.contrast == (0.5, 1.5) + assert color_jitter.saturation == (0.5, 1.5) + assert color_jitter.hue == (-0.1, 0.1) + + color_jitter_results = color_jitter(results) + target_keys = ['imgs'] + + assert assert_dict_has_keys(color_jitter_results, target_keys) + assert np.shape(color_jitter_results['imgs']) == (3, 112, 112, 3) + for img in color_jitter_results['imgs']: + assert np.all(img >= 0) + assert np.all(img <= 255) + + assert repr(color_jitter) == (f'{color_jitter.__class__.__name__}(' + f'brightness={(0.5, 1.5)}, ' + f'contrast={(0.5, 1.5)}, ' + f'saturation={(0.5, 1.5)}, ' + f'hue={-0.1, 0.1})') + + +class TestCrops: + + @staticmethod + def test_random_crop(): + with pytest.raises(TypeError): + # size must be an int + RandomCrop(size=(112, 112)) + with pytest.raises(AssertionError): + # "size > height" or "size > width" is not allowed + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=320) + random_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + + # General case + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + results['gt_bboxes'] = np.array([[0, 0, 340, 224]]) + results['proposals'] = np.array([[0, 0, 340, 224]]) + kp = np.array([[160, 120], 
[160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + # Test the case that no need for cropping + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + # Test the one-side-equal case + imgs = list(np.random.rand(2, 224, 225, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 224 + + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(size={224}, lazy={False})') + + @staticmethod + def test_random_resized_crop(): + with pytest.raises(TypeError): + # area_range must be a tuple of float + RandomResizedCrop(area_range=0.5) + with pytest.raises(TypeError): + # aspect_ratio_range must be a tuple of float + RandomResizedCrop(area_range=(0.08, 1.0), aspect_ratio_range=0.1) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + # There will be a slight difference because of rounding + eps = 0.01 + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + results['gt_bboxes'] = np.array([[0, 0, 340, 256]]) + results['proposals'] = np.array([[0, 0, 340, 256]]) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + + with pytest.raises(AssertionError): + # area_range[0] > area_range[1], 
which is wrong + random_crop = RandomResizedCrop(area_range=(0.9, 0.7)) + random_crop(results) + with pytest.raises(AssertionError): + # 0 > area_range[0] and area_range[1] > 1, which is wrong + random_crop = RandomResizedCrop(aspect_ratio_range=(-0.1, 2.0)) + random_crop(results) + + random_crop = RandomResizedCrop() + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert ((0.08 - eps <= h * w / 256 / 341) + and (h * w / 256 / 341 <= 1 + eps)) + assert (3. / 4. - eps <= h / w) and (h / w - eps <= 4. / 3.) + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(area_range={(0.08, 1.0)}, ' + f'aspect_ratio_range={(3 / 4, 4 / 3)}, ' + f'lazy={False})') + + random_crop = RandomResizedCrop( + area_range=(0.9, 0.9), aspect_ratio_range=(10.0, 10.1)) + # Test fallback cases by very big area range + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert check_crop(imgs, random_crop_result['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 256 + + @staticmethod + def test_multi_scale_crop(): + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop(0.5) + + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop('224') + + with pytest.raises(TypeError): + # scales must be tuple. + MultiScaleCrop( + 224, scales=[ + 1, + ]) + + with pytest.raises(ValueError): + # num_fix_crops must be in [5, 13] + MultiScaleCrop(224, num_fixed_crops=6) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'scales'] + + # MultiScaleCrop with normal crops. 
+ imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + results['gt_bboxes'] = np.array([[0, 0, 340, 256]]) + results['proposals'] = np.array([[0, 0, 340, 256]]) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with more fixed crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + num_fixed_crops=13) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with random crop. 
+ imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=True, + max_wh_scale_gap=0) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_results = multi_scale_crop(results) + assert assert_dict_has_keys(multi_scale_crop_results, target_keys) + assert check_crop(imgs, multi_scale_crop_results['imgs'], + multi_scale_crop_results['crop_bbox']) + assert (multi_scale_crop_results['img_shape'] in [(256, 256), + (204, 204)]) + + assert repr(multi_scale_crop) == ( + f'{multi_scale_crop.__class__.__name__}' + f'(input_size={(224, 224)}, scales={(1, 0.8)}, ' + f'max_wh_scale_gap={0}, random_crop={True}, ' + f'num_fixed_crops=5, lazy={False})') + + @staticmethod + def test_center_crop(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop('224') + + # center crop with crop_size 224 + # add kps in test_center_crop + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs) + kp = np.array([[160, 120], [160, 120]]).reshape([1, 1, 2, 2]) + results['keypoint'] = kp + + results['gt_bboxes'] = np.array([[0, 0, 320, 240]]) + results['proposals'] = np.array([[0, 0, 320, 240]]) + center_crop = CenterCrop(crop_size=224) + center_crop_results = center_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'keypoint'] + assert assert_dict_has_keys(center_crop_results, target_keys) + assert check_crop(imgs, center_crop_results['imgs'], + center_crop_results['crop_bbox']) + assert np.all( + center_crop_results['crop_bbox'] == np.array([48, 8, 272, 232])) + assert center_crop_results['img_shape'] == (224, 224) + assert np.all(center_crop_results['keypoint'] == 112) + + assert repr(center_crop) == (f'{center_crop.__class__.__name__}' + f'(crop_size={(224, 224)}, lazy={False})') + + @staticmethod + def test_three_crop(): + with 
pytest.raises(TypeError): + # crop_size must be int or tuple of int + ThreeCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + ThreeCrop('224') + + # three crop with crop_size 120 + imgs = list(np.random.rand(2, 240, 120, 3)) + results = dict(imgs=imgs) + three_crop = ThreeCrop(crop_size=120) + three_crop_results = three_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(three_crop_results, target_keys) + assert check_crop(imgs, three_crop_results['imgs'], + three_crop_results['crop_bbox'], 3) + assert three_crop_results['img_shape'] == (120, 120) + + # three crop with crop_size 224 + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + three_crop = ThreeCrop(crop_size=224) + three_crop_results = three_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(three_crop_results, target_keys) + assert check_crop(imgs, three_crop_results['imgs'], + three_crop_results['crop_bbox'], 3) + assert three_crop_results['img_shape'] == (224, 224) + + assert repr(three_crop) == (f'{three_crop.__class__.__name__}' + f'(crop_size={(224, 224)})') + + @staticmethod + def test_ten_crop(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + TenCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + TenCrop('224') + + # ten crop with crop_size 256 + imgs = list(np.random.rand(2, 256, 256, 3)) + results = dict(imgs=imgs) + ten_crop = TenCrop(crop_size=224) + ten_crop_results = ten_crop(results) + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(ten_crop_results, target_keys) + assert check_crop(imgs, ten_crop_results['imgs'], + ten_crop_results['crop_bbox'], 10) + assert ten_crop_results['img_shape'] == (224, 224) + + assert repr(ten_crop) == (f'{ten_crop.__class__.__name__}' + f'(crop_size={(224, 224)})') + + +class TestFlip: + + @staticmethod + def 
test_flip(): + with pytest.raises(ValueError): + # direction must be in ['horizontal', 'vertical'] + Flip(direction='vertically') + + target_keys = ['imgs', 'flip_direction', 'modality'] + + # do not flip imgs. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + flip = Flip(flip_ratio=0, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + assert np.array_equal(imgs, results['imgs']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # always flip imgs horizontally. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + results['gt_bboxes'] = np.array([[0, 0, 60, 60]]) + results['proposals'] = np.array([[0, 0, 60, 60]]) + flip = Flip(flip_ratio=1, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + if flip_results['flip'] is True: + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # flip flow images horizontally + imgs = [ + np.arange(16).reshape(4, 4).astype(np.float32), + np.arange(16, 32).reshape(4, 4).astype(np.float32) + ] + results = dict(imgs=copy.deepcopy(imgs), modality='Flow') + flip = Flip(flip_ratio=1, direction='horizontal') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + imgs = [x.reshape(4, 4, 1) for x in imgs] + flip_results['imgs'] = [ + x.reshape(4, 4, 1) for x in flip_results['imgs'] + ] + if flip_results['flip'] is True: + assert check_flip([imgs[0]], + [mmcv.iminvert(flip_results['imgs'][0])], + flip_results['flip_direction']) + assert check_flip([imgs[1]], [flip_results['imgs'][1]], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == 
id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + # always flip imgs vertivally. + imgs = list(np.random.rand(2, 64, 64, 3)) + results = dict(imgs=copy.deepcopy(imgs), modality='RGB') + flip = Flip(flip_ratio=1, direction='vertical') + flip_results = flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + if flip_results['flip'] is True: + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert np.shape(flip_results['imgs']) == np.shape(imgs) + + assert repr(flip) == (f'{flip.__class__.__name__}' + f'(flip_ratio={1}, direction=vertical, ' + f'flip_label_map={None}, lazy={False})') + + # transform label for the flipped image with the specific label. + _flip_label_map = {4: 6} + imgs = list(np.random.rand(2, 64, 64, 3)) + + # the label should be mapped. + results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=4) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 6 + + # the label should not be mapped. 
+ results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=3) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 3 + + # flip the keypoints + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='horizontal', left_kp=[0], right_kp=[1]) + flip_results = flip(results) + assert_array_almost_equal(flip_results['keypoint'][0, 0], + np.array([[1, 63], [63, 1]])) + + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='horizontal', left_kp=[], right_kp=[]) + flip_results = flip(results) + assert_array_almost_equal(flip_results['keypoint'][0, 0], + np.array([[63, 1], [1, 63]])) + + with pytest.raises(AssertionError): + results = dict( + keypoint=np.array([[1, 1], [63, 63]]).reshape([1, 1, 2, 2]), + modality='Pose', + img_shape=(64, 64)) + flip = Flip( + flip_ratio=1, direction='vertical', left_kp=[], right_kp=[]) + flip_results = flip(results) + + +class TestLazy: + + @staticmethod + def test_init_lazy(): + from mmaction.datasets.transforms.processing import \ + _init_lazy_if_proper # noqa: E501 + with pytest.raises(AssertionError): + # use lazy operation but "lazy" not in results + result = dict(lazy=dict(), img_shape=[64, 64]) + _init_lazy_if_proper(result, False) + + lazy_keys = [ + 'original_shape', 'crop_bbox', 'flip', 'flip_direction', + 'interpolation' + ] + + # 'img_shape' not in results + result = dict(imgs=list(np.random.randn(3, 64, 64, 3))) + _init_lazy_if_proper(result, True) + assert assert_dict_has_keys(result, ['imgs', 'lazy', 'img_shape']) + assert assert_dict_has_keys(result['lazy'], lazy_keys) + + # 'img_shape' in results + result = dict(img_shape=[64, 64]) + _init_lazy_if_proper(result, True) + assert assert_dict_has_keys(result, 
['lazy', 'img_shape']) + assert assert_dict_has_keys(result['lazy'], lazy_keys) + + # do not use lazy operation + result = dict(img_shape=[64, 64]) + _init_lazy_if_proper(result, False) + assert assert_dict_has_keys(result, ['img_shape']) + assert 'lazy' not in result + + @staticmethod + def test_random_crop_lazy(): + with pytest.raises(TypeError): + # size must be an int + RandomCrop(size=(112, 112), lazy=True) + with pytest.raises(AssertionError): + # "size > height" or "size > width" is not allowed + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=320, lazy=True) + random_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'lazy'] + + # General case + imgs = list(np.random.rand(2, 224, 341, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + # Test the case that no need for cropping + imgs = list(np.random.rand(2, 224, 224, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + # Test the one-side-equal case + imgs = list(np.random.rand(2, 224, 225, 3)) + results = dict(imgs=imgs) + random_crop = RandomCrop(size=224, 
lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert 'lazy' not in random_crop_result_fuse + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result_fuse['img_shape'] + assert h == w == 224 + + assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(size={224}, lazy={True})') + + @staticmethod + def test_random_resized_crop_lazy(): + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'lazy'] + # There will be a slight difference because of rounding + eps = 0.01 + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + + with pytest.raises(AssertionError): + # area_range[0] > area_range[1], which is wrong + random_crop = RandomResizedCrop(area_range=(0.9, 0.7), lazy=True) + random_crop(results) + with pytest.raises(AssertionError): + # 0 > area_range[0] and area_range[1] > 1, which is wrong + random_crop = RandomResizedCrop( + aspect_ratio_range=(-0.1, 2.0), lazy=True) + random_crop(results) + + random_crop = RandomResizedCrop(lazy=True) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert ((0.08 - eps <= h * w / 256 / 341) + and (h * w / 256 / 341 <= 1 + eps)) + assert (3. / 4. - eps <= h / w) and (h / w - eps <= 4. / 3.) 
+ assert repr(random_crop) == (f'{random_crop.__class__.__name__}' + f'(area_range={(0.08, 1.0)}, ' + f'aspect_ratio_range={(3 / 4, 4 / 3)}, ' + f'lazy={True})') + + random_crop = RandomResizedCrop( + area_range=(0.9, 0.9), aspect_ratio_range=(10.0, 10.1), lazy=True) + # Test fallback cases by very big area range + imgs = np.random.rand(2, 256, 341, 3) + results = dict(imgs=imgs) + random_crop_result = random_crop(results) + assert assert_dict_has_keys(random_crop_result, target_keys) + assert id(imgs) == id(random_crop_result['imgs']) + random_crop_result_fuse = Fuse()(random_crop_result) + assert check_crop(imgs, random_crop_result_fuse['imgs'], + results['crop_bbox']) + h, w = random_crop_result['img_shape'] + assert h == w == 256 + + @staticmethod + def test_multi_scale_crop_lazy(): + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop(0.5, lazy=True) + + with pytest.raises(TypeError): + # input_size must be int or tuple of int + MultiScaleCrop('224', lazy=True) + + with pytest.raises(TypeError): + # scales must be tuple. + MultiScaleCrop( + 224, scales=[ + 1, + ], lazy=True) + + with pytest.raises(ValueError): + # num_fix_crops must be in [5, 13] + MultiScaleCrop(224, num_fixed_crops=6, lazy=True) + + target_keys = ['imgs', 'crop_bbox', 'img_shape', 'scales'] + + # MultiScaleCrop with normal crops. 
+ imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with more fixed crops. + imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=False, + max_wh_scale_gap=0, + num_fixed_crops=13, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)] + + # MultiScaleCrop with random crop. 
+ imgs = list(np.random.rand(2, 256, 341, 3)) + results = dict(imgs=imgs) + config = dict( + input_size=224, + scales=(1, 0.8), + random_crop=True, + max_wh_scale_gap=0, + lazy=True) + multi_scale_crop = MultiScaleCrop(**config) + multi_scale_crop_result = multi_scale_crop(results) + assert id(imgs) == id(multi_scale_crop_result['imgs']) + assert assert_dict_has_keys(multi_scale_crop_result, target_keys) + multi_scale_crop_result_fuse = Fuse()(multi_scale_crop_result) + assert check_crop(imgs, multi_scale_crop_result_fuse['imgs'], + multi_scale_crop_result['crop_bbox']) + assert (multi_scale_crop_result_fuse['img_shape'] in [(256, 256), + (204, 204)]) + + assert repr(multi_scale_crop) == ( + f'{multi_scale_crop.__class__.__name__}' + f'(input_size={(224, 224)}, scales={(1, 0.8)}, ' + f'max_wh_scale_gap={0}, random_crop={True}, ' + f'num_fixed_crops={5}, lazy={True})') + + @staticmethod + def test_resize_lazy(): + with pytest.raises(ValueError): + # scale must be positive + Resize(-0.5, lazy=True) + + with pytest.raises(TypeError): + # scale must be tuple of int + Resize('224', lazy=True) + + target_keys = [ + 'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'modality' + ] + + # scale with -1 to indicate np.inf + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(-1, 256), keep_ratio=True, lazy=True) + resize_results = resize(results) + assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [341 / 320, 256 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (256, 341) + + # scale with a normal tuple (320, 320) to indicate np.inf + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(320, 320), keep_ratio=False, lazy=True) + resize_results = resize(results) + 
assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [1, 320 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (320, 320) + + # scale with a normal tuple (341, 256) to indicate np.inf + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs, modality='RGB') + resize = Resize(scale=(341, 256), keep_ratio=False, lazy=True) + resize_results = resize(results) + assert id(imgs) == id(resize_results['imgs']) + assert assert_dict_has_keys(resize_results, target_keys) + resize_results_fuse = Fuse()(resize_results) + assert np.all(resize_results_fuse['scale_factor'] == np.array( + [341 / 320, 256 / 240], dtype=np.float32)) + assert resize_results_fuse['img_shape'] == (256, 341) + + assert repr(resize) == (f'{resize.__class__.__name__}' + f'(scale={(341, 256)}, keep_ratio={False}, ' + + f'interpolation=bilinear, lazy={True})') + + @staticmethod + def test_flip_lazy(): + with pytest.raises(ValueError): + Flip(direction='vertically', lazy=True) + + target_keys = ['imgs', 'flip_direction', 'modality'] + + # do not flip imgs. + imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=0, direction='horizontal', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert np.equal(imgs, results['imgs']).all() + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + # always flip imgs horizontally. 
+ imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=1, direction='horizontal', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + # always flip imgs vertivally. + imgs = list(np.random.rand(2, 64, 64, 3)) + imgs_tmp = imgs.copy() + results = dict(imgs=imgs_tmp, modality='RGB') + flip = Flip(flip_ratio=1, direction='vertical', lazy=True) + flip_results = flip(results) + assert id(imgs_tmp) == id(flip_results['imgs']) + assert assert_dict_has_keys(flip_results, target_keys) + flip_results_fuse = Fuse()(flip_results) + assert check_flip(imgs, flip_results['imgs'], + flip_results['flip_direction']) + assert id(flip_results['imgs']) == id(results['imgs']) + assert flip_results_fuse['imgs'][0].shape == (64, 64, 3) + + assert repr(flip) == (f'{flip.__class__.__name__}' + f'(flip_ratio={1}, direction=vertical, ' + f'flip_label_map={None}, lazy={True})') + + @staticmethod + def test_center_crop_lazy(): + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop(0.5) + + with pytest.raises(TypeError): + # crop_size must be int or tuple of int + CenterCrop('224') + + # center crop with crop_size 224 + imgs = list(np.random.rand(2, 240, 320, 3)) + results = dict(imgs=imgs) + center_crop = CenterCrop(crop_size=224, lazy=True) + center_crop_results = center_crop(results) + + target_keys = ['imgs', 'crop_bbox', 'img_shape'] + assert assert_dict_has_keys(center_crop_results, target_keys) + center_crop_results_fuse = Fuse()(center_crop_results) + assert check_crop(imgs, center_crop_results_fuse['imgs'], + 
center_crop_results['crop_bbox']) + assert np.all(center_crop_results_fuse['crop_bbox'] == np.array( + [48, 8, 272, 232])) + assert center_crop_results_fuse['img_shape'] == (224, 224) + + assert repr(center_crop) == (f'{center_crop.__class__.__name__}' + f'(crop_size={(224, 224)}, lazy={True})') diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..69ba27d3cd52270af5e24563382e6c8756883d50 --- /dev/null +++ b/tests/datasets/transforms/test_sampling.py @@ -0,0 +1,863 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp + +import mmcv +import numpy as np +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_equal + +from mmaction.datasets.transforms import (AudioFeatureSelector, + DenseSampleFrames, SampleAVAFrames, + SampleFrames, UntrimmedSampleFrames) + + +class BaseTestLoading: + + @classmethod + def setup_class(cls): + cls.data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../data')) + cls.img_path = osp.join(cls.data_prefix, 'test.jpg') + cls.video_path = osp.join(cls.data_prefix, 'test.mp4') + cls.wav_path = osp.join(cls.data_prefix, 'test.wav') + cls.audio_spec_path = osp.join(cls.data_prefix, 'test.npy') + cls.img_dir = osp.join(cls.data_prefix, 'imgs') + cls.raw_feature_dir = osp.join(cls.data_prefix, 'activitynet_features') + cls.bsp_feature_dir = osp.join(cls.data_prefix, 'bsp_features') + cls.proposals_dir = osp.join(cls.data_prefix, 'proposals') + + cls.total_frames = 5 + cls.filename_tmpl = 'img_{:05}.jpg' + cls.flow_filename_tmpl = '{}_{:05d}.jpg' + video_total_frames = len(mmcv.VideoReader(cls.video_path)) + cls.audio_total_frames = video_total_frames + + cls.video_results = dict( + filename=cls.video_path, + label=1, + total_frames=video_total_frames, + start_index=0) + cls.audio_results = dict( + audios=np.random.randn(1280, ), + 
audio_path=cls.wav_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.audio_feature_results = dict( + audios=np.random.randn(128, 80), + audio_path=cls.audio_spec_path, + total_frames=cls.audio_total_frames, + label=1, + start_index=0) + cls.frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + modality='RGB', + offset=0, + label=1) + cls.flow_frame_results = dict( + frame_dir=cls.img_dir, + total_frames=cls.total_frames, + filename_tmpl=cls.flow_filename_tmpl, + modality='Flow', + offset=0, + label=1) + cls.action_results = dict( + video_name='v_test1', + data_prefix=cls.raw_feature_dir, + temporal_scale=5, + boundary_ratio=0.1, + duration_second=10, + duration_frame=10, + feature_frame=8, + annotations=[{ + 'segment': [3.0, 5.0], + 'label': 'Rock climbing' + }]) + """ + from mmaction.datasets.ssn_dataset import SSNInstance + cls.proposal_results = dict( + frame_dir=cls.img_dir, + video_id='imgs', + total_frames=cls.total_frames, + filename_tmpl=cls.filename_tmpl, + start_index=1, + out_proposals=[[['imgs', SSNInstance(1, 4, 10, 1, 1, 1)], 0], + [['imgs', SSNInstance(2, 5, 10, 2, 1, 1)], 0]]) + """ + + cls.ava_results = dict( + fps=30, timestamp=902, timestamp_start=840, shot_info=(0, 27000)) + + cls.hvu_label_example1 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[0], object=[2, 3], scene=[0, 1])) + cls.hvu_label_example2 = dict( + categories=['action', 'object', 'scene', 'concept'], + category_nums=[2, 5, 3, 2], + label=dict(action=[1], scene=[1, 2], concept=[1])) + + +class TestSampling(BaseTestLoading): + + def test_sample_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + # Sample Frame with tail Frames + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + 
clip_len=3, frame_interval=1, num_clips=5, keep_tail_frames=True) + sample_frames = SampleFrames(**config) + sample_frames(video_result) + sample_frames(frame_result) + + # Sample Frame with no temporal_jitter + # clip_len=3, frame_interval=1, num_clips=5 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=3, frame_interval=1, num_clips=5, temporal_jitter=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 15 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 15 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'frame_interval={1}, ' + f'num_clips={5}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Sample Frame with no temporal_jitter + # clip_len=5, frame_interval=1, num_clips=5, + # out_of_bound_opt='repeat_last' + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=5, + frame_interval=1, + num_clips=5, + temporal_jitter=False, + out_of_bound_opt='repeat_last') + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={5}, ' + f'frame_interval={1}, ' + f'num_clips={5}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=repeat_last, ' + f'test_mode={False})') + + def check_monotonous(arr): + length = arr.shape[0] + for i in range(length - 1): + if arr[i] > arr[i + 1]: + return False + return True + + assert 
assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 25 + frame_inds = sample_frames_results['frame_inds'].reshape([5, 5]) + for i in range(5): + assert check_monotonous(frame_inds[i]) + + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 25 + frame_inds = sample_frames_results['frame_inds'].reshape([5, 5]) + for i in range(5): + assert check_monotonous(frame_inds[i]) + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with temporal_jitter + # clip_len=4, frame_interval=2, num_clips=5 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, frame_interval=2, num_clips=5, temporal_jitter=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 20 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 20 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={2}, ' + f'num_clips={5}, ' + f'temporal_jitter={True}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Sample Frame with no temporal_jitter in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert 
assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 24 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'temporal_jitter={False}, ' + f'twice_sample={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + # Sample Frame with no temporal_jitter in test mode + # clip_len=3, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=3, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 18 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert 
len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 2, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 2, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=6, frame_interval=1, num_clips=1 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 5 + config = dict( + clip_len=6, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + [1, 2, 3, 4, 5, 1]) + + # Sample Frame with no temporal_jitter to get avg_interval <= 0 + # clip_len=12, frame_interval=1, num_clips=20 + video_result = copy.deepcopy(self.video_results) + frame_result = 
copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 30 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=20, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 240 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 240 + assert np.max(sample_frames_results['frame_inds']) <= 30 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame with no temporal_jitter to get clip_offsets + # clip_len=1, frame_interval=1, num_clips=8 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 6 + config = dict( + clip_len=1, + frame_interval=1, + num_clips=8, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert sample_frames_results['start_index'] == 0 + assert len(sample_frames_results['frame_inds']) == 8 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 8 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([1, 2, 3, 3, 4, 5, 5, 6])) + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=12, frame_interval=1, num_clips=2 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 10 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=2, + temporal_jitter=False, + test_mode=False) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert 
sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 24 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 10 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + # Sample Frame using twice sample + # clip_len=12, frame_interval=1, num_clips=2 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + frame_result['total_frames'] = 40 + config = dict( + clip_len=12, + frame_interval=1, + num_clips=2, + temporal_jitter=False, + twice_sample=True, + test_mode=True) + sample_frames = SampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 48 + sample_frames_results = sample_frames(frame_result) + assert len(sample_frames_results['frame_inds']) == 48 + assert np.max(sample_frames_results['frame_inds']) <= 40 + assert np.min(sample_frames_results['frame_inds']) >= 1 + + def test_dense_sample_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + # Dense sample with no temporal_jitter in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 
240 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 240 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={64}, ' + f'num_sample_positions={10}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + # Dense sample with no temporal_jitter + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, frame_interval=1, num_clips=6, temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + + # Dense sample with no temporal_jitter, sample_range=32 in test mode + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=32, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 240 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 240 + + # Dense sample with no temporal_jitter, sample_range=32 + # 
clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=32, + temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={32}, ' + f'num_sample_positions={10}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={False})') + + # Dense sample with no temporal_jitter, sample_range=1000 to check mod + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + sample_range=1000, + temporal_jitter=False) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 24 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 24 + + # Dense sample with no temporal_jitter in test mode + # sample_range=32, num_sample_positions=5 + # clip_len=4, frame_interval=1, num_clips=6 + video_result = copy.deepcopy(self.video_results) + frame_result = 
copy.deepcopy(self.frame_results) + config = dict( + clip_len=4, + frame_interval=1, + num_clips=6, + num_sample_positions=5, + sample_range=32, + temporal_jitter=False, + test_mode=True) + dense_sample_frames = DenseSampleFrames(**config) + dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 + assert assert_dict_has_keys(dense_sample_frames_results, target_keys) + assert len(dense_sample_frames_results['frame_inds']) == 120 + dense_sample_frames_results = dense_sample_frames(frame_result) + assert len(dense_sample_frames_results['frame_inds']) == 120 + assert repr(dense_sample_frames) == ( + f'{dense_sample_frames.__class__.__name__}(' + f'clip_len={4}, ' + f'frame_interval={1}, ' + f'num_clips={6}, ' + f'sample_range={32}, ' + f'num_sample_positions={5}, ' + f'temporal_jitter={False}, ' + f'out_of_bound_opt=loop, ' + f'test_mode={True})') + + def test_untrim_sample_frames(self): + + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames' + ] + + frame_result = dict( + frame_dir=None, + total_frames=100, + filename_tmpl=None, + modality='RGB', + start_index=0, + label=1) + video_result = copy.deepcopy(self.video_results) + + config = dict(clip_len=1, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([8, 24, 40, 56, 72, 88])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=1, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(video_result) + assert assert_dict_has_keys(sample_frames_results, 
target_keys) + frame_inds = np.array(list(range(8, 300, 16))) + assert len(sample_frames_results['frame_inds']) == frame_inds.shape[0] + assert_array_equal(sample_frames_results['frame_inds'], frame_inds) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=1, clip_interval=16) + sample_frames = UntrimmedSampleFrames(**config) + frame_result_ = copy.deepcopy(frame_result) + frame_result_['start_index'] = 1 + sample_frames_results = sample_frames(frame_result_) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 6 + assert_array_equal(sample_frames_results['frame_inds'], + np.array([8, 24, 40, 56, 72, 88]) + 1) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict(clip_len=3, clip_interval=16) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + assert_array_equal( + sample_frames_results['frame_inds'], + np.array([ + 7, 8, 9, 23, 24, 25, 39, 40, 41, 55, 56, 57, 71, 72, 73, 87, + 88, 89 + ])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'clip_interval={16}, ' + f'frame_interval={1})') + + config = dict( + clip_len=3, clip_interval=16, frame_interval=4) # , start_index=0) + sample_frames = UntrimmedSampleFrames(**config) + sample_frames_results = sample_frames(frame_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 18 + assert_array_equal( + sample_frames_results['frame_inds'], + np.array([ + 4, 8, 12, 20, 24, 28, 36, 40, 44, 52, 56, 60, 68, 72, 76, 84, + 88, 92 + 
])) + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={3}, ' + f'clip_interval={16}, ' + f'frame_interval={4})') + + def test_sample_ava_frames(self): + target_keys = [ + 'fps', 'timestamp', 'timestamp_start', 'shot_info', 'frame_inds', + 'clip_len', 'frame_interval' + ] + config = dict(clip_len=32, frame_interval=2) + sample_ava_dataset = SampleAVAFrames(**config) + ava_result = sample_ava_dataset(results=self.ava_results) + assert assert_dict_has_keys(ava_result, target_keys) + assert ava_result['clip_len'] == 32 + assert ava_result['frame_interval'] == 2 + assert len(ava_result['frame_inds']) == 32 + assert repr(sample_ava_dataset) == ( + f'{sample_ava_dataset.__class__.__name__}(' + f'clip_len={32}, ' + f'frame_interval={2}, ' + f'test_mode={False})') + + # add test case in Issue #306 + config = dict(clip_len=8, frame_interval=8) + sample_ava_dataset = SampleAVAFrames(**config) + ava_result = sample_ava_dataset(results=self.ava_results) + assert assert_dict_has_keys(ava_result, target_keys) + assert ava_result['clip_len'] == 8 + assert ava_result['frame_interval'] == 8 + assert len(ava_result['frame_inds']) == 8 + assert repr(sample_ava_dataset) == ( + f'{sample_ava_dataset.__class__.__name__}(' + f'clip_len={8}, ' + f'frame_interval={8}, ' + f'test_mode={False})') + + """ TODO + def test_sample_proposal_frames(self): + target_keys = [ + 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', + 'total_frames', 'start_index' + ] + + # test error cases + with pytest.raises(TypeError): + proposal_result = copy.deepcopy(self.proposal_results) + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=('error', 'error'), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames(proposal_result) + + # test normal cases + # Sample Frame with no temporal_jitter + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = 
copy.deepcopy(self.proposal_results) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=train)') + + # Sample Frame with temporal_jitter + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposal_results) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=True) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={True}, ' + f'mode=train)') + + # Sample Frame with no temporal_jitter in val mode + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 9 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False, + mode='val') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = 
sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=val)') + + # Sample Frame with no temporal_jitter in test mode + # test_interval=2 + proposal_result = copy.deepcopy(self.proposals) + proposal_result['out_proposals'] = None + proposal_result['total_frames'] = 10 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + test_interval=2, + temporal_jitter=False, + mode='test') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 5 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={2}, ' + f'temporal_jitter={False}, ' + f'mode=test)') + + # Sample Frame with no temporal_jitter to get clip_offsets zero + # clip_len=1, frame_interval=1 + # body_segments=2, aug_segments=(1, 1) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 3 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=2, + aug_segments=(1, 1), + aug_ratio=0.5, + temporal_jitter=False) + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 8 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + 
f'body_segments={2}, ' + f'aug_segments={(1, 1)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=train)') + + # Sample Frame with no temporal_jitter to + # get clip_offsets zero in val mode + # clip_len=1, frame_interval=1 + # body_segments=4, aug_segments=(2, 2) + proposal_result = copy.deepcopy(self.proposals) + proposal_result['total_frames'] = 3 + config = dict( + clip_len=1, + frame_interval=1, + body_segments=4, + aug_segments=(2, 2), + aug_ratio=0.5, + temporal_jitter=False, + mode='val') + sample_frames = SampleProposalFrames(**config) + sample_frames_results = sample_frames(proposal_result) + assert assert_dict_has_keys(sample_frames_results, target_keys) + assert len(sample_frames_results['frame_inds']) == 16 + assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}(' + f'clip_len={1}, ' + f'body_segments={4}, ' + f'aug_segments={(2, 2)}, ' + f'aug_ratio={(0.5, 0.5)}, ' + f'frame_interval={1}, ' + f'test_interval={6}, ' + f'temporal_jitter={False}, ' + f'mode=val)') + """ + + def test_audio_feature_selector(self): + target_keys = ['audios'] + # test frame selector with 2 dim input + inputs = copy.deepcopy(self.audio_feature_results) + inputs['frame_inds'] = np.arange(0, self.audio_total_frames, + 2)[:, np.newaxis] + inputs['num_clips'] = 1 + inputs['length'] = 1280 + audio_feature_selector = AudioFeatureSelector() + results = audio_feature_selector(inputs) + assert assert_dict_has_keys(results, target_keys) + assert repr(audio_feature_selector) == ( + f'{audio_feature_selector.__class__.__name__}(' + f'fix_length={128})') diff --git a/tests/datasets/transforms/test_text_transforms.py b/tests/datasets/transforms/test_text_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..e3444db34e32d824a8d06c20ef72f11019a3f430 --- /dev/null +++ b/tests/datasets/transforms/test_text_transforms.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import torch + +from mmaction.datasets.transforms import CLIPTokenize + + +class TestTextTransforms: + + @staticmethod + def test_clip_tokenize(): + results = {'text': 'Hello, MMAction2 2.0!'} + clip_tokenize = CLIPTokenize() + results = clip_tokenize(results) + assert results['text'].shape[0] == 77 + assert results['text'].dtype == torch.int32 diff --git a/tests/datasets/transforms/test_wrappers.py b/tests/datasets/transforms/test_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..86a1099202b4622fb5a4b4d78abedb8d014230b5 --- /dev/null +++ b/tests/datasets/transforms/test_wrappers.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pytest +from mmengine.testing import assert_dict_has_keys +from numpy.testing import assert_array_almost_equal + +from mmaction.datasets.transforms import CenterCrop, ImgAug + + +def check_flip(origin_imgs, result_imgs, flip_type): + """Check if the origin_imgs are flipped correctly into result_imgs in + different flip_types.""" + n, _, _, _ = np.shape(origin_imgs) + if flip_type == 'horizontal': + for i in range(n): + if np.any(result_imgs[i] != np.fliplr(origin_imgs[i])): + return False + else: + # yapf: disable + for i in range(n): + if np.any(result_imgs[i] != np.transpose(np.fliplr(np.transpose(origin_imgs[i], (1, 0, 2))), (1, 0, 2))): # noqa:E501 + return False + # yapf: enable + return True + + +class TestAugumentations: + + @staticmethod + def test_ImgAug(): + + with pytest.raises(ValueError): + # transforms only support one string, 'default' + ImgAug(transforms='test') + + with pytest.raises(ValueError): + # transforms only support string or list of dicts + # or iaa.Augmenter object + ImgAug(transforms=dict(type='Rotate')) + + with pytest.raises(AssertionError): + # each dict must have a `type` key + ImgAug(transforms=[dict(rotate=(-30, 30))]) + + with pytest.raises(AttributeError): + # `type` must be available in ImgAug + 
ImgAug(transforms=[dict(type='BlaBla')]) + + with pytest.raises(TypeError): + # `type` must be str or iaa available type + ImgAug(transforms=[dict(type=CenterCrop)]) + + from imgaug import augmenters as iaa + + # check default configs + target_keys = ['imgs', 'img_shape', 'modality'] + imgs = list(np.random.randint(0, 255, (1, 64, 64, 3)).astype(np.uint8)) + results = dict(imgs=imgs, modality='RGB') + default_ImgAug = ImgAug(transforms='default') + default_results = default_ImgAug(results) + assert_dict_has_keys(default_results, target_keys) + assert default_results['img_shape'] == (64, 64) + + # check flip (both images and bboxes) + target_keys = ['imgs', 'gt_bboxes', 'proposals', 'img_shape'] + imgs = list(np.random.rand(1, 64, 64, 3).astype(np.float32)) + results = dict( + imgs=imgs, + modality='RGB', + proposals=np.array([[0, 0, 25, 35]]), + img_shape=(64, 64), + gt_bboxes=np.array([[0, 0, 25, 35]])) + ImgAug_flip = ImgAug(transforms=[dict(type='Fliplr')]) + flip_results = ImgAug_flip(results) + assert assert_dict_has_keys(flip_results, target_keys) + assert check_flip(imgs, flip_results['imgs'], 'horizontal') + assert_array_almost_equal(flip_results['gt_bboxes'], + np.array([[39, 0, 64, 35]])) + assert_array_almost_equal(flip_results['proposals'], + np.array([[39, 0, 64, 35]])) + transforms = iaa.Sequential([iaa.Fliplr()]) + assert repr(ImgAug_flip) == f'ImgAug(transforms={transforms})' + + # check crop (both images and bboxes) + target_keys = ['crop_bbox', 'gt_bboxes', 'imgs', 'img_shape'] + imgs = list(np.random.rand(1, 122, 122, 3)) + results = dict( + imgs=imgs, + modality='RGB', + img_shape=(122, 122), + gt_bboxes=np.array([[1.5, 2.5, 110, 64]])) + ImgAug_center_crop = ImgAug(transforms=[ + dict( + type=iaa.CropToFixedSize, + width=100, + height=100, + position='center') + ]) + crop_results = ImgAug_center_crop(results) + assert_dict_has_keys(crop_results, target_keys) + assert_array_almost_equal(crop_results['gt_bboxes'], + np.array([[0., 0., 99., 
53.]])) + assert 'proposals' not in results + transforms = iaa.Sequential( + [iaa.CropToFixedSize(width=100, height=100, position='center')]) + assert repr(ImgAug_center_crop) == f'ImgAug(transforms={transforms})' + + # check resize (images only) + target_keys = ['imgs', 'img_shape'] + imgs = list(np.random.rand(1, 64, 64, 3)) + results = dict(imgs=imgs, modality='RGB') + transforms = iaa.Resize(32) + ImgAug_resize = ImgAug(transforms=transforms) + resize_results = ImgAug_resize(results) + assert_dict_has_keys(resize_results, target_keys) + assert resize_results['img_shape'] == (32, 32) + assert repr(ImgAug_resize) == f'ImgAug(transforms={transforms})' diff --git a/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py b/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..d9ceba67e03dbf509efc139405ce41530bb7932c --- /dev/null +++ b/tests/engine/optimizers/test_swin_optim_wrapper_constructor.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch
+import torch.nn as nn
+
+from mmaction.engine.optimizers import SwinOptimWrapperConstructor
+
+
+class SubModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(2, 2, kernel_size=1, groups=2)
+        self.gn = nn.GroupNorm(2, 2)
+        self.fc = nn.Linear(2, 2)
+        self.param1 = nn.Parameter(torch.ones(1))
+
+
+class ExampleModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.param1 = nn.Parameter(torch.ones(1))
+        self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False)
+        self.conv2 = nn.Conv2d(4, 2, kernel_size=1)
+        self.bn = nn.BatchNorm2d(2)
+        self.sub = SubModel()
+        self.fc = nn.Linear(2, 1)
+
+
+base_lr = 0.01
+base_wd = 0.0001
+betas = (0.9, 0.999)
+
+
+def test_swin_optim_wrapper_constructor():
+    model = ExampleModel()
+    optim_wrapper_cfg = dict(
+        optimizer=dict(
+            type='AdamW', lr=base_lr, weight_decay=base_wd, betas=betas))
+    paramwise_cfg = {
+        'base.param1': dict(lr_mult=2.),
+        'base.conv1.weight': dict(lr_mult=3.),
+        'bn': dict(decay_mult=0.),
+        'sub': dict(lr_mult=0.1),
+        'sub.conv1.bias': dict(decay_mult=0.1),
+        'gn': dict(decay_mult=0.),
+    }
+    constructor = SwinOptimWrapperConstructor(optim_wrapper_cfg, paramwise_cfg)
+    optim_wrapper = constructor(model)
+
+    optimizer = optim_wrapper.optimizer
+    param_groups = optimizer.param_groups
+    assert isinstance(optimizer, torch.optim.AdamW)
+    assert optimizer.defaults['lr'] == base_lr
+    assert optimizer.defaults['weight_decay'] == base_wd
+    model_parameters = list(model.parameters())
+    assert len(param_groups) == len(model_parameters)
+    for i, param in enumerate(model_parameters):
+        param_group = param_groups[i]
+        assert torch.equal(param_group['params'][0], param)
+        assert param_group['betas'] == betas
+
+    # param1
+    param1 = param_groups[0]
+    assert param1['lr'] == base_lr * paramwise_cfg['base.param1']['lr_mult']
+    assert param1['weight_decay'] == base_wd
+    # conv1.weight
+    conv1_weight = param_groups[1]
+    assert conv1_weight['lr'] == \
+        base_lr * paramwise_cfg['base.conv1.weight']['lr_mult']
+    assert conv1_weight['weight_decay'] == base_wd
+    # conv2.weight
+    conv2_weight = param_groups[2]
+    assert conv2_weight['lr'] == base_lr
+    assert conv2_weight['weight_decay'] == base_wd
+    # conv2.bias
+    conv2_bias = param_groups[3]
+    assert conv2_bias['lr'] == base_lr
+    assert conv2_bias['weight_decay'] == base_wd
+    # bn.weight
+    bn_weight = param_groups[4]
+    assert bn_weight['lr'] == base_lr
+    assert bn_weight['weight_decay'] == \
+        base_wd * paramwise_cfg['bn']['decay_mult']
+    # bn.bias
+    bn_bias = param_groups[5]
+    assert bn_bias['lr'] == base_lr
+    assert bn_bias['weight_decay'] == \
+        base_wd * paramwise_cfg['bn']['decay_mult']
+    # sub.param1
+    sub_param1 = param_groups[6]
+    assert sub_param1['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_param1['weight_decay'] == base_wd
+    # sub.conv1.weight
+    sub_conv1_weight = param_groups[7]
+    assert sub_conv1_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_conv1_weight['weight_decay'] == base_wd
+    # sub.conv1.bias
+    sub_conv1_bias = param_groups[8]
+    assert sub_conv1_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_conv1_bias['weight_decay'] == \
+        base_wd * paramwise_cfg['sub.conv1.bias']['decay_mult']
+    # sub.gn.weight
+    sub_gn_weight = param_groups[9]
+    assert sub_gn_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_gn_weight['weight_decay'] == \
+        base_wd * paramwise_cfg['gn']['decay_mult']
+    # sub.gn.bias
+    sub_gn_bias = param_groups[10]
+    assert sub_gn_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_gn_bias['weight_decay'] == \
+        base_wd * paramwise_cfg['gn']['decay_mult']
+    # sub.fc.weight
+    sub_fc_weight = param_groups[11]
+    assert sub_fc_weight['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_fc_weight['weight_decay'] == base_wd
+    # sub.fc.bias
+    sub_fc_bias = param_groups[12]
+    assert sub_fc_bias['lr'] == base_lr * paramwise_cfg['sub']['lr_mult']
+    assert sub_fc_bias['weight_decay'] == base_wd
+    # fc.weight
+    fc_weight = param_groups[13]
+    assert fc_weight['lr'] == base_lr
+    assert fc_weight['weight_decay'] == base_wd
+    # fc.bias
+    fc_bias = param_groups[14]
+    assert fc_bias['lr'] == base_lr
+    assert fc_bias['weight_decay'] == base_wd
diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..e51ce0d61c62c2a24f00df509b188d9a46a54635
--- /dev/null
+++ b/tests/evaluation/metrics/test_acc_metric.py
@@ -0,0 +1,200 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import platform
+from unittest import TestCase
+
+import numpy as np
+import pytest
+import torch
+from mmengine import load
+from numpy.testing import assert_array_almost_equal
+
+from mmaction.evaluation import AccMetric, ConfusionMatrix, MultiSportsMetric
+from mmaction.evaluation.functional import ava_eval
+from mmaction.registry import METRICS
+from mmaction.structures import ActionDataSample
+
+
+def generate_data(num_classes=5, random_label=False, multi_label=False):
+    # Build fake predictions: `data_batch` is unused by the metrics and
+    # stays empty; each sample carries a random score vector plus a label
+    # that is either all-ones (multi-label), random, or the argmax (so the
+    # prediction is guaranteed correct).
+    data_batch = []
+    data_samples = []
+    for i in range(num_classes * 10):
+        scores = torch.randn(num_classes)
+        if multi_label:
+            label = torch.ones_like(scores)
+        elif random_label:
+            label = torch.randint(num_classes, size=[1])
+        else:
+            label = torch.LongTensor([scores.argmax().item()])
+        data_sample = dict(pred_score=scores, gt_label=label)
+        data_samples.append(data_sample)
+    return data_batch, data_samples
+
+
+def test_acc_metric():
+    num_classes = 32
+    metric = AccMetric(metric_list=('top_k_accuracy', 'mean_class_accuracy'))
+    data_batch, predictions = generate_data(
+        num_classes=num_classes, random_label=True)
+    metric.process(data_batch, predictions)
+    eval_results = metric.compute_metrics(metric.results)
+    assert 0.0 <= eval_results['top1'] <= eval_results['top5'] <= 1.0
+    assert 0.0 <= eval_results['mean1'] <= 1.0
+    metric.results.clear()
+
+    # argmax labels -> every prediction is correct, accuracies must be 1.0
+    data_batch, predictions = generate_data(
+        num_classes=num_classes, random_label=False)
+    metric.process(data_batch, predictions)
+    eval_results = metric.compute_metrics(metric.results)
+    assert eval_results['top1'] == eval_results['top5'] == 1.0
+    assert eval_results['mean1'] == 1.0
+
+    metric = AccMetric(
+        metric_list=('mean_average_precision', 'mmit_mean_average_precision'))
+    data_batch, predictions = generate_data(
+        num_classes=num_classes, multi_label=True)
+    metric.process(data_batch, predictions)
+    eval_results = metric.compute_metrics(metric.results)
+    assert eval_results['mean_average_precision'] == 1.0
+    assert eval_results['mmit_mean_average_precision'] == 1.0
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Multiprocess Fail')
+def test_ava_detection():
+    data_prefix = osp.normpath(
+        osp.join(osp.dirname(__file__), '../../data/eval_detection'))
+
+    gt_path = osp.join(data_prefix, 'gt.csv')
+    result_path = osp.join(data_prefix, 'pred.csv')
+    label_map = osp.join(data_prefix, 'action_list.txt')
+
+    # eval bbox
+    detection = ava_eval(result_path, 'mAP', label_map, gt_path, None)
+    assert_array_almost_equal(detection['overall'], 0.09385522)
+
+
+def test_multisport_detection():
+    data_prefix = osp.normpath(
+        osp.join(osp.dirname(__file__), '../../data/eval_multisports'))
+
+    gt_path = osp.join(data_prefix, 'gt.pkl')
+    result_path = osp.join(data_prefix, 'data_samples.pkl')
+
+    result_datasamples = load(result_path)
+    metric = MultiSportsMetric(gt_path)
+    metric.process(None, result_datasamples)
+    eval_result = metric.compute_metrics(metric.results)
+    assert eval_result['frameAP'] == 83.6506
+    assert eval_result['v_map@0.2'] == 37.5
+    assert eval_result['v_map@0.5'] == 37.5
+    assert eval_result['v_map_0.10:0.90'] == 29.1667
+
+
+class TestConfusionMatrix(TestCase):
+
+    def test_evaluate(self):
+        """Test using the metric in the same way as Evaluator."""
+        pred = [
+            ActionDataSample().set_pred_score(i).set_pred_label(
+                j).set_gt_label(k).to_dict() for i, j, k in zip([
+                    torch.tensor([0.7, 0.0, 0.3]),
+                    torch.tensor([0.5, 0.2, 0.3]),
+                    torch.tensor([0.4, 0.5, 0.1]),
+                    torch.tensor([0.0, 0.0, 1.0]),
+                    torch.tensor([0.0, 0.0, 1.0]),
+                    torch.tensor([0.0, 0.0, 1.0]),
+                ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0])
+        ]
+
+        # Test with score (use score instead of label if score exists)
+        metric = METRICS.build(dict(type='ConfusionMatrix'))
+        metric.process(None, pred)
+        res = metric.evaluate(6)
+        self.assertIsInstance(res, dict)
+        self.assertTensorEqual(
+            res['confusion_matrix/result'],
+            torch.tensor([
+                [2, 0, 1],
+                [0, 1, 1],
+                [0, 0, 1],
+            ]))
+
+        # Test with label
+        for sample in pred:
+            del sample['pred_score']
+        metric = METRICS.build(dict(type='ConfusionMatrix'))
+        metric.process(None, pred)
+        with self.assertRaisesRegex(AssertionError,
+                                    'Please specify the `num_classes`'):
+            metric.evaluate(6)
+
+        metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3))
+        metric.process(None, pred)
+        # NOTE(review): `res` must be recomputed here, otherwise the
+        # assertions below silently re-check the stale result of the
+        # score-based metric above.
+        res = metric.evaluate(6)
+        self.assertIsInstance(res, dict)
+        self.assertTensorEqual(
+            res['confusion_matrix/result'],
+            torch.tensor([
+                [2, 0, 1],
+                [0, 1, 1],
+                [0, 0, 1],
+            ]))
+
+    def test_calculate(self):
+        y_true = np.array([0, 0, 1, 2, 1, 0])
+        y_label = torch.tensor([0, 0, 1, 2, 2, 2])
+        y_score = [
+            [0.7, 0.0, 0.3],
+            [0.5, 0.2, 0.3],
+            [0.4, 0.5, 0.1],
+            [0.0, 0.0, 1.0],
+            [0.0, 0.0, 1.0],
+            [0.0, 0.0, 1.0],
+        ]
+
+        # Test with score
+        cm = ConfusionMatrix.calculate(y_score, y_true)
+        self.assertIsInstance(cm, torch.Tensor)
+        self.assertTensorEqual(
+            cm, torch.tensor([
+                [2, 0, 1],
+                [0, 1, 1],
+                [0, 0, 1],
+            ]))
+
+        # Test with label
+        with self.assertRaisesRegex(AssertionError,
+                                    'Please specify the `num_classes`'):
+            ConfusionMatrix.calculate(y_label, y_true)
+
+        cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3)
+        self.assertIsInstance(cm, torch.Tensor)
+        self.assertTensorEqual(
+            cm, torch.tensor([
+                [2, 0, 1],
+                [0, 1, 1],
+                [0, 0, 1],
+            ]))
+
+        # Test with invalid inputs
+        with self.assertRaisesRegex(TypeError, " is not"):
+            ConfusionMatrix.calculate(y_label, 'hi')
+
+    def test_plot(self):
+        import matplotlib.pyplot as plt
+
+        cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]])
+        fig = ConfusionMatrix.plot(cm, include_values=True, show=False)
+
+        self.assertIsInstance(fig, plt.Figure)
+
+    def assertTensorEqual(self,
+                          tensor: torch.Tensor,
+                          value: float,
+                          msg=None,
+                          **kwarg):
+        tensor = tensor.to(torch.float32)
+        value = torch.tensor(value).float()
+        try:
+            torch.testing.assert_allclose(tensor, value, **kwarg)
+        except AssertionError as e:
+            self.fail(self._formatMessage(msg, str(e)))
diff --git a/tests/evaluation/metrics/test_metric_utils.py b/tests/evaluation/metrics/test_metric_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f24fcbbe1e931f65c2a96d11b902e04940d887c9
--- /dev/null
+++ b/tests/evaluation/metrics/test_metric_utils.py
@@ -0,0 +1,205 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_almost_equal, assert_array_equal
+
+from mmaction.evaluation.functional import (average_recall_at_avg_proposals,
+                                            confusion_matrix,
+                                            get_weighted_score,
+                                            pairwise_temporal_iou,
+                                            top_k_classes)
+
+
+def test_top_k_accurate_classes():
+    scores = [
+        np.array([0.1, 0.2, 0.3, 0.4]),  # 3
+        np.array([0.2, 0.3, 0.4, 0.1]),  # 2
+        np.array([0.3, 0.4, 0.1, 0.2]),  # 1
+        np.array([0.4, 0.1, 0.2, 0.3]),  # 0
+        np.array([0.25, 0.1, 0.3, 0.35]),  # 3
+        np.array([0.2, 0.15, 0.3, 0.35]),  # 3
+    ]
+    label = np.array([3, 2, 2, 1, 3, 3], dtype=np.int64)
+
+    with pytest.raises(AssertionError):
+        top_k_classes(scores, label, 1, mode='wrong')
+
+    results_top1 = top_k_classes(scores, label, 1)
+    results_top3 = top_k_classes(scores, label, 3)
+    assert len(results_top1) == 1
+    assert len(results_top3) == 3
+    assert results_top3[0] == results_top1[0]
+    assert results_top1 == [(3, 1.)]
+    assert results_top3 == [(3, 1.), (2, 0.5), (1, 0.0)]
+
+    label = np.array([3, 2, 1, 1, 3, 0], dtype=np.int64)
+    results_top1 = top_k_classes(scores, label, 1, mode='inaccurate')
+    results_top3 = top_k_classes(scores, label, 3, mode='inaccurate')
+    assert len(results_top1) == 1
+    assert len(results_top3) == 3
+    assert results_top3[0] == results_top1[0]
+    assert results_top1 == [(0, 0.)]
+    assert results_top3 == [(0, 0.0), (1, 0.5), (2, 1.0)]
+
+
+def test_pairwise_temporal_iou():
+    target_segments = np.array([])
+    candidate_segments = np.array([])
+    with pytest.raises(ValueError):
+        pairwise_temporal_iou(target_segments, candidate_segments)
+
+    # test temporal iou
+    target_segments = np.array([[1, 2], [2, 3]])
+    candidate_segments = np.array([[2, 3], [2.5, 3]])
+    temporal_iou = pairwise_temporal_iou(candidate_segments, target_segments)
+    assert_array_equal(temporal_iou, [[0, 0], [1, 0.5]])
+
+    # test temporal overlap_self
+    target_segments = np.array([[1, 2], [2, 3]])
+    candidate_segments = np.array([[2, 3], [2.5, 3]])
+    temporal_iou, temporal_overlap_self = pairwise_temporal_iou(
+        candidate_segments, target_segments, calculate_overlap_self=True)
+    assert_array_equal(temporal_overlap_self, [[0, 0], [1, 1]])
+
+    # test temporal overlap_self when candidate_segments is 1d
+    target_segments = np.array([[1, 2], [2, 3]])
+    candidate_segments = np.array([2.5, 3])
+    temporal_iou, temporal_overlap_self = pairwise_temporal_iou(
+        candidate_segments, target_segments, calculate_overlap_self=True)
+    assert_array_equal(temporal_overlap_self, [0, 1])
+
+
+def test_average_recall_at_avg_proposals():
+    ground_truth1 = {
+        'v_test1': np.array([[0, 1], [1, 2]]),
+        'v_test2': np.array([[0, 1], [1, 2]])
+    }
+    ground_truth2 = {'v_test1': np.array([[0, 1]])}
+    proposals1 = {
+        'v_test1': np.array([[0, 1, 1], [1, 2, 1]]),
+        'v_test2': np.array([[0, 1, 1], [1, 2, 1]])
+    }
+    proposals2 = {
+        'v_test1': np.array([[10, 11, 0.6], [11, 12, 0.4]]),
+        'v_test2': np.array([[10, 11, 0.6], [11, 12, 0.4]])
+    }
+    proposals3 = {
+        'v_test1': np.array([[i, i + 1, 1 / (i + 1)] for i in range(100)])
+    }
+
+    recall, avg_recall, proposals_per_video, auc = (
+        average_recall_at_avg_proposals(ground_truth1, proposals1, 4))
+    assert_array_equal(recall, [[0.] * 49 + [0.5] * 50 + [1.]] * 10)
+    assert_array_equal(avg_recall, [0.] * 49 + [0.5] * 50 + [1.])
+    assert_array_almost_equal(
+        proposals_per_video, np.arange(0.02, 2.02, 0.02), decimal=10)
+    assert auc == 25.5
+
+    recall, avg_recall, proposals_per_video, auc = (
+        average_recall_at_avg_proposals(ground_truth1, proposals2, 4))
+    assert_array_equal(recall, [[0.] * 100] * 10)
+    assert_array_equal(avg_recall, [0.] * 100)
+    assert_array_almost_equal(
+        proposals_per_video, np.arange(0.02, 2.02, 0.02), decimal=10)
+    assert auc == 0
+
+    recall, avg_recall, proposals_per_video, auc = (
+        average_recall_at_avg_proposals(ground_truth2, proposals3, 100))
+    assert_array_equal(recall, [[1.] * 100] * 10)
+    assert_array_equal(avg_recall, ([1.] * 100))
+    assert_array_almost_equal(
+        proposals_per_video, np.arange(1, 101, 1), decimal=10)
+    assert auc == 99.0
+
+
+def test_get_weighted_score():
+    score_a = [
+        np.array([-0.2203, -0.7538, 1.8789, 0.4451, -0.2526]),
+        np.array([-0.0413, 0.6366, 1.1155, 0.3484, 0.0395]),
+        np.array([0.0365, 0.5158, 1.1067, -0.9276, -0.2124]),
+        np.array([0.6232, 0.9912, -0.8562, 0.0148, 1.6413])
+    ]
+    score_b = [
+        np.array([-0.0413, 0.6366, 1.1155, 0.3484, 0.0395]),
+        np.array([0.0365, 0.5158, 1.1067, -0.9276, -0.2124]),
+        np.array([0.6232, 0.9912, -0.8562, 0.0148, 1.6413]),
+        np.array([-0.2203, -0.7538, 1.8789, 0.4451, -0.2526])
+    ]
+    weighted_score = get_weighted_score([score_a], [1])
+    assert np.all(np.isclose(np.array(score_a), np.array(weighted_score)))
+    coeff_a, coeff_b = 2., 1.
+    weighted_score = get_weighted_score([score_a, score_b], [coeff_a, coeff_b])
+    ground_truth = [
+        x * coeff_a + y * coeff_b for x, y in zip(score_a, score_b)
+    ]
+    assert np.all(np.isclose(np.array(ground_truth), np.array(weighted_score)))
+
+
+def gt_confusion_matrix(gt_labels, pred_labels, normalize=None):
+    """Calculate the ground truth confusion matrix."""
+    max_index = max(max(gt_labels), max(pred_labels))
+    confusion_mat = np.zeros((max_index + 1, max_index + 1), dtype=np.int64)
+    for gt, pred in zip(gt_labels, pred_labels):
+        confusion_mat[gt][pred] += 1
+    # drop classes that appear in neither labels nor predictions
+    del_index = []
+    for i in range(max_index):
+        if sum(confusion_mat[i]) == 0 and sum(confusion_mat[:, i]) == 0:
+            del_index.append(i)
+    confusion_mat = np.delete(confusion_mat, del_index, axis=0)
+    confusion_mat = np.delete(confusion_mat, del_index, axis=1)
+
+    if normalize is not None:
+        confusion_mat = np.array(confusion_mat, dtype=np.float64)
+        m, n = confusion_mat.shape
+        if normalize == 'true':
+            for i in range(m):
+                s = np.sum(confusion_mat[i], dtype=float)
+                if s == 0:
+                    continue
+                confusion_mat[i, :] = confusion_mat[i, :] / s
+        elif normalize == 'pred':
+            for i in range(n):
+                s = sum(confusion_mat[:, i])
+                if s == 0:
+                    continue
+                confusion_mat[:, i] = confusion_mat[:, i] / s
+        elif normalize == 'all':
+            s = np.sum(confusion_mat)
+            if s != 0:
+                confusion_mat /= s
+
+    return confusion_mat
+
+
+def test_confusion_matrix():
+    # custom confusion_matrix
+    gt_labels = [np.int64(random.randint(0, 9)) for _ in range(100)]
+    pred_labels = np.random.randint(10, size=100, dtype=np.int64)
+
+    for normalize in [None, 'true', 'pred', 'all']:
+        cf_mat = confusion_matrix(pred_labels, gt_labels, normalize)
+        gt_cf_mat = gt_confusion_matrix(gt_labels, pred_labels, normalize)
+        assert_array_equal(cf_mat, gt_cf_mat)
+
+    with pytest.raises(ValueError):
+        # normalize must be in ['true', 'pred', 'all', None]
+        confusion_matrix([1], [1], 'unsupport')
+
+    with pytest.raises(TypeError):
+        # y_pred must be list or np.ndarray
+        confusion_matrix(0.5, [1])
+
+    with pytest.raises(TypeError):
+        # y_real must be list or np.ndarray
+        confusion_matrix([1], 0.5)
+
+    with pytest.raises(TypeError):
+        # y_pred dtype must be np.int64
+        confusion_matrix([0.5], [1])
+
+    with pytest.raises(TypeError):
+        # y_real dtype must be np.int64
+        confusion_matrix([1], [0.5])
diff --git a/tests/evaluation/metrics/test_retrieval_metric.py b/tests/evaluation/metrics/test_retrieval_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4709b723a02ff4165bf4de3551e6de736decc17
--- /dev/null
+++ b/tests/evaluation/metrics/test_retrieval_metric.py
@@ -0,0 +1,165 @@
+# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import numpy as np
import pytest
import torch

from mmaction.evaluation.metrics import RetrievalMetric, RetrievalRecall
from mmaction.registry import METRICS
from mmaction.structures import ActionDataSample


def generate_data(num_samples=5, feat_dim=10, random_label=False):
    """Build ``(data_batch, data_samples)`` for RetrievalMetric tests.

    When ``random_label`` is False the text feature is a clone of the video
    feature, making retrieval perfect by construction.

    NOTE(review): ``data_batch`` is always returned empty; the tests below
    pass it straight to ``metric.process`` — presumably the metric only
    reads ``data_samples``; confirm against RetrievalMetric.
    """
    data_batch = []
    data_samples = []
    for _ in range(num_samples):
        video_feature = torch.randn(feat_dim)
        if random_label:
            text_feature = torch.randn(feat_dim)
        else:
            text_feature = video_feature.clone()

        data_sample = dict(
            features=dict(
                video_feature=video_feature, text_feature=text_feature))
        data_samples.append(data_sample)
    return data_batch, data_samples


def test_acc_metric():
    """RetrievalMetric rejects bad metric names and bounds its outputs."""
    with pytest.raises(ValueError):
        # unsupported metric name
        RetrievalMetric(metric_list='R100')

    num_samples = 20
    metric = RetrievalMetric()
    data_batch, predictions = generate_data(
        num_samples=num_samples, random_label=True)
    metric.process(data_batch, predictions)
    eval_results = metric.compute_metrics(metric.results)
    # Recall is monotone in k and bounded by 100.
    assert 0.0 <= eval_results['R1'] <= eval_results['R5'] <= eval_results[
        'R10'] <= 100.0
    assert 0.0 <= eval_results['MdR'] <= num_samples
    assert 0.0 <= eval_results['MnR'] <= num_samples

    metric.results.clear()

    # Identical video/text features -> perfect retrieval.
    data_batch, predictions = generate_data(
        num_samples=num_samples, random_label=False)
    metric.process(data_batch, predictions)
    eval_results = metric.compute_metrics(metric.results)
    assert eval_results['R1'] == eval_results['R5'] == eval_results[
        'R10'] == 100.0
    assert eval_results['MdR'] == eval_results['MnR'] == 1.0


class TestRetrievalRecall(TestCase):

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        pred = [
            ActionDataSample().set_pred_score(i).set_gt_label(k).to_dict()
            for i, k in zip([
                torch.tensor([0.7, 0.0, 0.3]),
                torch.tensor([0.5, 0.2, 0.3]),
                torch.tensor([0.4, 0.5, 0.1]),
                torch.tensor([0.0, 0.0, 1.0]),
                torch.tensor([0.0, 0.0, 1.0]),
                torch.tensor([0.0, 0.0, 1.0]),
            ], [[0], [0], [1], [2], [2], [0]])
        ]

        # Test with score (use score instead of label if score exists)
        metric = METRICS.build(dict(type='RetrievalRecall', topk=1))
        metric.process(None, pred)
        recall = metric.evaluate(6)
        self.assertIsInstance(recall, dict)
        self.assertAlmostEqual(
            recall['retrieval/Recall@1'], 5 / 6 * 100, places=4)

        # Test with invalid topk
        with self.assertRaisesRegex(RuntimeError, 'selected index k'):
            metric = METRICS.build(dict(type='RetrievalRecall', topk=10))
            metric.process(None, pred)
            metric.evaluate(6)

        with self.assertRaisesRegex(ValueError, '`topk` must be a'):
            METRICS.build(dict(type='RetrievalRecall', topk=-1))

        # Test initialization with a scalar topk
        metric = METRICS.build(dict(type='RetrievalRecall', topk=5))
        self.assertEqual(metric.topk, (5, ))

        # Test initialization with a tuple topk
        metric = METRICS.build(dict(type='RetrievalRecall', topk=(1, 2, 5)))
        self.assertEqual(metric.topk, (1, 2, 5))

    def test_calculate(self):
        """Test using the metric from static method."""

        # seq of indices format
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 2

        # test with average is 'macro'
        recall_score = RetrievalRecall.calculate(
            y_pred, y_true, topk=1, pred_indices=True, target_indices=True)
        expect_recall = 50.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with tensor input
        y_true = torch.Tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
                               [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]])
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=1)
        expect_recall = 50.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with topk=2 (both rows have a positive within the top 2)
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=2)
        expect_recall = 100.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with topk is (1, 5)
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=(1, 5))
        expect_recalls = [50., 100.]
        self.assertEqual(len(recall_score), len(expect_recalls))
        for i in range(len(expect_recalls)):
            self.assertEqual(recall_score[i].item(), expect_recalls[i])

        # Test with invalid pred (not a sequence)
        y_pred = dict()
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        with self.assertRaisesRegex(AssertionError, '`pred` must be Seq'):
            RetrievalRecall.calculate(y_pred, y_true, True, True)

        # Test with invalid target (not a sequence)
        y_true = dict()
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` must be Seq'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with different length `pred` with `target`
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 3
        with self.assertRaisesRegex(AssertionError, 'Length of `pred`'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with an invalid target element
        y_true = [[0, 2, 5, 8, 9], dict()]
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` should be'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with an invalid pred element
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10), dict()]
        with self.assertRaisesRegex(AssertionError, '`pred` should be'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmaction.models import AAGCN
from mmaction.utils import register_all_modules


def test_aagcn_backbone():
    """Test AAGCN backbone."""

    register_all_modules()

    mode = 'spatial'
    batch_size, num_person, num_frames = 2, 2, 150

    # Every supported graph layout, paired with its joint count. Output is
    # (N, M, C, T', V) with C=256 and the 150 input frames reduced to 38.
    layouts = [('openpose', 18), ('nturgb+d', 25), ('coco', 17)]
    for layout, num_joints in layouts:
        model = AAGCN(graph_cfg=dict(layout=layout, mode=mode))
        model.init_weights()
        inputs = torch.randn(batch_size, num_person, num_frames, num_joints,
                             3)
        output = model(inputs)
        assert output.shape == torch.Size(
            [batch_size, num_person, 256, 38, num_joints])

    # Custom settings: disabling the attention module degenerates AAGCN to
    # AGCN; the output shape is unchanged (coco inputs reused from above).
    model = AAGCN(
        graph_cfg=dict(layout='coco', mode=mode), gcn_attention=False)
    model.init_weights()
    output = model(inputs)
    assert output.shape == torch.Size([2, 2, 256, 38, 17])
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmaction.models import C2D, C3D
from mmaction.testing import generate_backbone_demo_inputs


def test_c2d_backbone():
    """C2D-R50 and C2D-R101 both map (1,3,8,64,64) to (1,2048,4,2,2)."""
    imgs = generate_backbone_demo_inputs((1, 3, 8, 64, 64))

    for depth in (50, 101):
        backbone = C2D(depth=depth)
        backbone.init_weights()
        backbone.train()
        feat = backbone(imgs)
        assert feat.shape == torch.Size([1, 2048, 4, 2, 2])


def test_c3d_backbone():
    """C3D (with and without BN) flattens (1,3,16,24,24) to (1,4096)."""
    imgs = generate_backbone_demo_inputs((1, 3, 16, 24, 24))

    # plain C3D inference
    plain = C3D(out_dim=512)
    plain.init_weights()
    plain.train()
    assert plain(imgs).shape == torch.Size([1, 4096])

    # C3D with batch normalization
    with_bn = C3D(out_dim=512, norm_cfg=dict(type='BN3d'))
    with_bn.init_weights()
    with_bn.train()
    assert with_bn(imgs).shape == torch.Size([1, 4096])
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm

from mmaction.models import MobileNetV2
from mmaction.testing import check_norm_state, generate_backbone_demo_inputs


def test_mobilenetv2_backbone():
    """Test MobileNetV2.

    Modified from mmclassification.
    """
    from torch.nn.modules import GroupNorm

    from mmaction.models.backbones.mobilenet_v2 import InvertedResidual

    def is_norm(modules):
        """Check if is one of the norms."""
        return isinstance(modules, (GroupNorm, _BatchNorm))

    def is_block(modules):
        """Check if is ResNet building block."""
        return isinstance(modules, (InvertedResidual, ))

    # Expected per-stage output shapes for widen_factor=1.0 at 224x224
    # input (stages 0-6); reused by several forward checks below.
    wf1_shapes = [(1, 16, 112, 112), (1, 24, 56, 56), (1, 32, 28, 28),
                  (1, 64, 14, 14), (1, 96, 14, 14), (1, 160, 7, 7),
                  (1, 320, 7, 7)]

    def assert_shapes(feats, expected):
        """Assert a tuple of feature maps matches the expected shapes."""
        assert len(feats) == len(expected)
        for feat, shape in zip(feats, expected):
            assert feat.shape == torch.Size(shape)

    with pytest.raises(TypeError):
        # pretrained must be a string path
        model = MobileNetV2()
        model.init_weights(pretrained=0)

    with pytest.raises(ValueError):
        # frozen_stages must be in range(1, 9)
        MobileNetV2(frozen_stages=9)

    with pytest.raises(ValueError):
        # out_indices must be in range(-1, 8)  (typo "tout_indices" fixed)
        MobileNetV2(out_indices=[8])

    imgs = generate_backbone_demo_inputs((1, 3, 224, 224))

    # Test MobileNetV2 with first stage frozen
    frozen_stages = 1
    model = MobileNetV2(frozen_stages=frozen_stages)
    model.init_weights()
    model.train()

    for mod in model.conv1.modules():
        for param in mod.parameters():
            assert param.requires_grad is False
    for i in range(1, frozen_stages + 1):
        layer = getattr(model, f'layer{i}')
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False

    # Test MobileNetV2 with all stages frozen
    frozen_stages = 8
    model = MobileNetV2(frozen_stages=frozen_stages)
    model.init_weights()
    model.train()

    for mod in model.modules():
        if not isinstance(mod, MobileNetV2):
            assert mod.training is False
        for param in mod.parameters():
            assert param.requires_grad is False

    # Test MobileNetV2 with norm_eval=True
    model = MobileNetV2(norm_eval=True)
    model.init_weights()
    model.train()

    assert check_norm_state(model.modules(), False)

    # Test MobileNetV2 forward with widen_factor=1.0, pretrained
    model = MobileNetV2(
        widen_factor=1.0,
        out_indices=range(0, 8),
        pretrained='mmcls://mobilenet_v2')
    model.init_weights()
    model.train()

    assert check_norm_state(model.modules(), True)

    feat = model(imgs)
    assert_shapes(feat, wf1_shapes + [(1, 1280, 7, 7)])

    # Test MobileNetV2 forward with widen_factor=0.5
    model = MobileNetV2(widen_factor=0.5, out_indices=range(0, 7))
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, [(1, 8, 112, 112), (1, 16, 56, 56), (1, 16, 28, 28),
                         (1, 32, 14, 14), (1, 48, 14, 14), (1, 80, 7, 7),
                         (1, 160, 7, 7)])

    # Test MobileNetV2 forward with widen_factor=2.0
    model = MobileNetV2(widen_factor=2.0)
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert feat.shape == torch.Size((1, 2560, 7, 7))

    # Test MobileNetV2 forward with out_indices=None (final feature only)
    model = MobileNetV2(widen_factor=1.0)
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert feat.shape == torch.Size((1, 1280, 7, 7))

    # Test MobileNetV2 forward with act_cfg=dict(type='ReLU')
    model = MobileNetV2(
        widen_factor=1.0, act_cfg=dict(type='ReLU'), out_indices=range(0, 7))
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, wf1_shapes)

    # Test MobileNetV2 with default (BatchNorm) norm layers forward.
    # (The original comment said GroupNorm here; the two were swapped.)
    model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 7))
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, _BatchNorm)
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, wf1_shapes)

    # Test MobileNetV2 with GroupNorm norm_cfg forward.
    model = MobileNetV2(
        widen_factor=1.0,
        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True),
        out_indices=range(0, 7))
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, GroupNorm)
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, wf1_shapes)

    # Test MobileNetV2 with layers 1, 3, 5 out forward
    model = MobileNetV2(widen_factor=1.0, out_indices=(0, 2, 4))
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, [(1, 16, 112, 112), (1, 32, 28, 28),
                         (1, 96, 14, 14)])

    # Test MobileNetV2 with checkpoint forward
    model = MobileNetV2(
        widen_factor=1.0, with_cp=True, out_indices=range(0, 7))
    for m in model.modules():
        if is_block(m):
            assert m.with_cp
    model.init_weights()
    model.train()

    feat = model(imgs)
    assert_shapes(feat, wf1_shapes)
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmaction.models import MobileNetV2TSM
from mmaction.testing import generate_backbone_demo_inputs


def test_mobilenetv2_tsm_backbone():
    """Test mobilenetv2_tsm backbone."""
    from mmcv.cnn import ConvModule

    from mmaction.models.backbones.mobilenet_v2 import InvertedResidual
    from mmaction.models.backbones.resnet_tsm import TemporalShift

    imgs = generate_backbone_demo_inputs((8, 3, 64, 64))

    # With widen_factor=1.0 each residual-connected InvertedResidual whose
    # conv holds three stages must have its first conv wrapped in a
    # TemporalShift carrying the backbone's segment/shift settings.
    tsm_model = MobileNetV2TSM(pretrained='mmcls://mobilenet_v2')
    tsm_model.init_weights()
    for module in tsm_model.modules():
        shifted = (
            isinstance(module, InvertedResidual) and len(module.conv) == 3
            and module.use_res_connect)
        if shifted:
            shift = module.conv[0]
            assert isinstance(shift, TemporalShift)
            assert shift.num_segments == tsm_model.num_segments
            assert shift.shift_div == tsm_model.shift_div
            assert isinstance(shift.net, ConvModule)

    # TSM-MobileNetV2 forward with widen_factor = 1.0
    assert tsm_model(imgs).shape == torch.Size([8, 1280, 2, 2])

    # forward with widen_factor = 0.5
    narrow = MobileNetV2TSM(widen_factor=0.5, pretrained2d=False)
    narrow.init_weights()
    assert narrow(imgs).shape == torch.Size([8, 1280, 2, 2])

    # forward with widen_factor = 1.5
    wide = MobileNetV2TSM(widen_factor=1.5, pretrained2d=False)
    wide.init_weights()
    assert wide(imgs).shape == torch.Size([8, 1920, 2, 2])
# Copyright (c) OpenMMLab. All rights reserved.
import os
import tempfile

import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmengine.runner.checkpoint import _load_checkpoint_with_prefix

from mmaction.models.backbones.mobileone_tsm import MobileOneTSM
from mmaction.testing import generate_backbone_demo_inputs


def test_mobileone_tsm_backbone():
    """Test MobileOne TSM backbone."""

    from mmpretrain.models.backbones.mobileone import MobileOneBlock

    from mmaction.models.backbones.resnet_tsm import TemporalShift

    model = MobileOneTSM('s0', pretrained2d=False)
    model.init_weights()
    for cur_module in model.modules():
        if isinstance(cur_module, TemporalShift):
            # TemporalShift is a wrapper of MobileOneBlock
            assert isinstance(cur_module.net, MobileOneBlock)
            assert cur_module.num_segments == model.num_segments
            assert cur_module.shift_div == model.shift_div

    inputs = generate_backbone_demo_inputs((8, 3, 64, 64))

    feat = model(inputs)
    assert feat.shape == torch.Size([8, 1024, 2, 2])

    # Each remaining architecture variant and its expected output width.
    for arch, channels in [('s1', 1280), ('s2', 2048), ('s3', 2048),
                           ('s4', 2048)]:
        model = MobileOneTSM(arch, pretrained2d=False)
        feat = model(inputs)
        assert feat.shape == torch.Size([8, channels, 2, 2])


def test_mobileone_init_weight():
    """Pretrained weights must load into the TSM-wrapped backbone intact."""
    checkpoint = ('https://download.openmmlab.com/mmclassification/v0'
                  '/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth')
    model = MobileOneTSM(
        arch='s0',
        init_cfg=dict(
            type='Pretrained', checkpoint=checkpoint, prefix='backbone'))
    model.init_weights()
    ori_ckpt = _load_checkpoint_with_prefix(
        'backbone', model.init_cfg['checkpoint'], map_location='cpu')
    for name, param in model.named_parameters():
        # TemporalShift wraps each block as `<name>.net`; strip the wrapper
        # segment to recover the original checkpoint key.
        ori_name = name.replace('.net', '')
        assert torch.allclose(param, ori_ckpt[ori_name]), \
            f'layer {name} fail to load from pretrained checkpoint'


def test_load_deploy_mobileone():
    """Deploy-mode model loaded from checkpoint matches train-mode output."""
    model = MobileOneTSM('s0', pretrained2d=False)
    inputs = generate_backbone_demo_inputs((8, 3, 64, 64))
    # Use a scoped temporary directory so the checkpoint file is removed
    # even when an assertion fails (the original wrote ckpt.pth into the
    # shared tempdir and only deleted it on success).
    with tempfile.TemporaryDirectory() as tmpdir:
        ckpt_path = os.path.join(tmpdir, 'ckpt.pth')
        model.switch_to_deploy()
        model.eval()
        outputs = model(inputs)

        model_deploy = MobileOneTSM('s0', pretrained2d=False, deploy=True)
        save_checkpoint(model.state_dict(), ckpt_path)
        load_checkpoint(model_deploy, ckpt_path)

        outputs_load = model_deploy(inputs)
        for feat, feat_load in zip(outputs, outputs_load):
            assert torch.allclose(feat, feat_load)
# Copyright (c) OpenMMLab. All rights reserved.
import math
from copy import deepcopy
from unittest import TestCase

import torch

from mmaction.models import MViT


class TestMViT(TestCase):
    """Unit tests for the MViT backbone."""

    def setUp(self):
        # Small shared default config; drop_path_rate drives the ramp check.
        self.cfg = dict(arch='tiny', drop_path_rate=0.1)

    def test_structure(self):
        """Arch validation, custom archs, out_scales and drop-path ramp."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            MViT(**cfg)

        # Test invalid custom arch (missing required keys)
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            MViT(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 96,
            'num_layers': 10,
            'num_heads': 1,
            'downscale_indices': [2, 5, 8]
        }
        stage_indices = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
        model = MViT(**cfg)
        self.assertEqual(model.embed_dims, 96)
        self.assertEqual(model.num_layers, 10)
        for i, block in enumerate(model.blocks):
            stage = stage_indices[i]
            # Channel width doubles at each downscale stage.
            self.assertEqual(block.out_dims, 96 * 2**(stage))

        # Test out_scales validation
        cfg = deepcopy(self.cfg)
        cfg['out_scales'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get "):
            MViT(**cfg)
        cfg['out_scales'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_scales 13'):
            MViT(**cfg)

        # Test model structure
        cfg = deepcopy(self.cfg)
        model = MViT(**cfg)
        stage_indices = [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3]
        self.assertEqual(len(model.blocks), 10)
        # Drop-path probability ramps linearly from 0 up to drop_path_rate.
        dpr_inc = 0.1 / (10 - 1)
        dpr = 0
        for i, block in enumerate(model.blocks):
            stage = stage_indices[i]
            # (removed leftover debug print of i/stage)
            self.assertEqual(block.attn.num_heads, 2**stage)
            if dpr > 0:
                self.assertAlmostEqual(block.drop_path.drop_prob, dpr)
            dpr += dpr_inc

    def test_init_weights(self):
        """init_weights populates pos_embed and re-draws the projection."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv3d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        cfg['use_abs_pos_embed'] = True
        model = MViT(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialize
        self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.)))

    def test_forward(self):
        """Forward shapes for single/multi out_scales and dynamic inputs."""
        imgs = torch.randn(1, 3, 6, 64, 64)

        cfg = deepcopy(self.cfg)
        model = MViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token, cls_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 768, 3, 2, 2))

        # Test forward with multi out scales
        cfg = deepcopy(self.cfg)
        cfg['out_scales'] = (0, 1, 2, 3)
        model = MViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        for stage, out in enumerate(outs):
            stride = 2**stage
            patch_token, cls_token = out
            self.assertEqual(patch_token.shape,
                             (1, 96 * stride, 3, 16 // stride, 16 // stride))
            self.assertEqual(cls_token.shape, (1, 96 * stride))

        # Test forward with dynamic input size
        imgs1 = torch.randn(1, 3, 2, 64, 64)
        imgs2 = torch.randn(1, 3, 2, 96, 96)
        imgs3 = torch.randn(1, 3, 2, 96, 128)
        cfg = deepcopy(self.cfg)
        model = MViT(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            patch_token, cls_token = outs[-1]
            expect_feat_shape = (math.ceil(imgs.shape[2] / 2),
                                 math.ceil(imgs.shape[3] / 32),
                                 math.ceil(imgs.shape[4] / 32))
            self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape))
            self.assertEqual(cls_token.shape, (1, 768))
0000000000000000000000000000000000000000..dab195461fd74b72b8053cfc089105e5e793bd11 --- /dev/null +++ b/tests/models/backbones/test_resnet.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +import torch.nn as nn +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import ResNet +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +def test_resnet_backbone(): + """Test resnet backbone.""" + with pytest.raises(KeyError): + # ResNet depth should be in [18, 34, 50, 101, 152] + ResNet(20) + + with pytest.raises(AssertionError): + # In ResNet: 1 <= num_stages <= 4 + ResNet(50, num_stages=0) + + with pytest.raises(AssertionError): + # In ResNet: 1 <= num_stages <= 4 + ResNet(50, num_stages=5) + + with pytest.raises(AssertionError): + # len(strides) == len(dilations) == num_stages + ResNet(50, strides=(1, ), dilations=(1, 1), num_stages=3) + + with pytest.raises(TypeError): + # pretrain must be a str + resnet50 = ResNet(50, pretrained=0) + resnet50.init_weights() + + with pytest.raises(AssertionError): + # style must be in ['pytorch', 'caffe'] + ResNet(18, style='tensorflow') + + with pytest.raises(AssertionError): + # assert not with_cp + ResNet(18, with_cp=True) + + # resnet with depth 18, norm_eval False, initial weights + resnet18 = ResNet(18) + resnet18.init_weights() + + # resnet with depth 50, norm_eval True + resnet50 = ResNet(50, norm_eval=True) + resnet50.init_weights() + resnet50.train() + assert check_norm_state(resnet50.modules(), False) + + # resnet with depth 50, norm_eval True, pretrained + resnet50_pretrain = ResNet( + pretrained='torchvision://resnet50', depth=50, norm_eval=True) + resnet50_pretrain.init_weights() + resnet50_pretrain.train() + assert check_norm_state(resnet50_pretrain.modules(), False) + + # resnet with depth 50, norm_eval True, frozen_stages 1 + frozen_stages = 1 + resnet50_frozen = ResNet(50, frozen_stages=frozen_stages) + 
resnet50_frozen.init_weights() + resnet50_frozen.train() + assert resnet50_frozen.conv1.bn.training is False + for layer in resnet50_frozen.conv1.modules(): + for param in layer.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(resnet50_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # resnet with depth 50, partial batchnorm + resnet_pbn = ResNet(50, partial_bn=True) + resnet_pbn.train() + count_bn = 0 + for m in resnet_pbn.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + assert m.weight.requires_grad is False + assert m.bias.requires_grad is False + assert m.training is False + else: + assert m.weight.requires_grad is True + assert m.bias.requires_grad is True + assert m.training is True + + input_shape = (1, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # resnet with depth 18 inference + resnet18 = ResNet(18, norm_eval=False) + resnet18.init_weights() + resnet18.train() + feat = resnet18(imgs) + assert feat.shape == torch.Size([1, 512, 2, 2]) + + # resnet with depth 50 inference + resnet50 = ResNet(50, norm_eval=False) + resnet50.init_weights() + resnet50.train() + feat = resnet50(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + # resnet with depth 50 in caffe style inference + resnet50_caffe = ResNet(50, style='caffe', norm_eval=False) + resnet50_caffe.init_weights() + resnet50_caffe.train() + feat = resnet50_caffe(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + resnet50_flow = ResNet( + depth=50, pretrained='torchvision://resnet50', in_channels=10) + input_shape = (1, 10, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = resnet50_flow(imgs) + assert feat.shape == torch.Size([1, 2048, 2, 2]) + + resnet50 = ResNet( + depth=50, pretrained='torchvision://resnet50', 
def test_resnet2plus1d_backbone():
    """Test the ResNet(2+1)d backbone."""
    # r2+1d does not support inflating 2d pretrained weights
    with pytest.raises(AssertionError):
        ResNet2Plus1d(50, None, pretrained2d=True)

    # r2+1d requires the Conv2plus1d conv module
    with pytest.raises(AssertionError):
        ResNet2Plus1d(
            50, None, pretrained2d=False, conv_cfg=dict(type='Conv3d'))

    frozen_stages = 1
    shared_cfg = dict(
        conv_cfg=dict(type='Conv2plus1d'),
        pretrained2d=False,
        frozen_stages=frozen_stages,
        conv1_kernel=(3, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(1, 1, 1, 1),
        spatial_strides=(1, 2, 2, 2),
        temporal_strides=(1, 2, 2, 2))

    # the same checks run for the depth-34 and depth-50 variants, which
    # only differ in the number of output channels
    for depth, channels in ((34, 512), (50, 2048)):
        model = ResNet2Plus1d(depth, None, **shared_cfg)
        model.init_weights()
        model.train()

        # frozen stem: both the spatial BN inside conv(2+1)d and the
        # outer BN stay in eval mode with gradients disabled
        assert model.conv1.conv.bn_s.training is False
        assert model.conv1.bn.training is False
        for param in model.conv1.parameters():
            assert param.requires_grad is False
        # every frozen stage is fully frozen as well
        for i in range(1, frozen_stages + 1):
            stage = getattr(model, f'layer{i}')
            for mod in stage.modules():
                if isinstance(mod, _BatchNorm):
                    assert mod.training is False
            for param in stage.parameters():
                assert param.requires_grad is False

        imgs = generate_backbone_demo_inputs((1, 3, 8, 64, 64))
        # parrots implements 3d conv only on gpu
        if torch.__version__ == 'parrots':
            if torch.cuda.is_available():
                feat = model.cuda()(imgs.cuda())
                assert feat.shape == torch.Size([1, channels, 1, 2, 2])
        else:
            feat = model(imgs)
            assert feat.shape == torch.Size([1, channels, 1, 2, 2])
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_resnet3d_backbone():
    """Test resnet3d backbone: config validation, weight inflation from 2d
    checkpoints, norm_eval / frozen_stages behavior and forward shapes."""
    with pytest.raises(AssertionError):
        # In ResNet3d: 1 <= num_stages <= 4
        ResNet3d(34, None, num_stages=0)

    with pytest.raises(AssertionError):
        # In ResNet3d: 1 <= num_stages <= 4
        ResNet3d(34, None, num_stages=5)

    with pytest.raises(AssertionError):
        # In ResNet3d: 1 <= num_stages <= 4
        ResNet3d(50, None, num_stages=0)

    with pytest.raises(AssertionError):
        # In ResNet3d: 1 <= num_stages <= 4
        ResNet3d(50, None, num_stages=5)

    with pytest.raises(AssertionError):
        # len(spatial_strides) == len(temporal_strides)
        # == len(dilations) == num_stages
        ResNet3d(
            50,
            None,
            spatial_strides=(1, ),
            temporal_strides=(1, 1),
            dilations=(1, 1, 1),
            num_stages=4)

    with pytest.raises(AssertionError):
        # len(spatial_strides) == len(temporal_strides)
        # == len(dilations) == num_stages
        ResNet3d(
            34,
            None,
            spatial_strides=(1, ),
            temporal_strides=(1, 1),
            dilations=(1, 1, 1),
            num_stages=4)

    with pytest.raises(TypeError):
        # pretrain must be str or None.
        resnet3d_34 = ResNet3d(34, ['resnet', 'bninception'])
        resnet3d_34.init_weights()

    with pytest.raises(TypeError):
        # pretrain must be str or None.
        resnet3d_50 = ResNet3d(50, ['resnet', 'bninception'])
        resnet3d_50.init_weights()

    # resnet3d with depth 34, no pretrained, norm_eval True
    resnet3d_34 = ResNet3d(34, None, pretrained2d=False, norm_eval=True)
    resnet3d_34.init_weights()
    resnet3d_34.train()
    assert check_norm_state(resnet3d_34.modules(), False)

    # resnet3d with depth 50, no pretrained, norm_eval True
    resnet3d_50 = ResNet3d(50, None, pretrained2d=False, norm_eval=True)
    resnet3d_50.init_weights()
    resnet3d_50.train()
    assert check_norm_state(resnet3d_50.modules(), False)

    # resnet3d with depth 50, pretrained2d, norm_eval True
    resnet3d_50_pretrain = ResNet3d(
        50, 'torchvision://resnet50', norm_eval=True)
    resnet3d_50_pretrain.init_weights()
    resnet3d_50_pretrain.train()
    assert check_norm_state(resnet3d_50_pretrain.modules(), False)
    from mmengine.runner.checkpoint import _load_checkpoint
    chkp_2d = _load_checkpoint('torchvision://resnet50')
    # Verify every inflated 3d module against its 2d counterpart: a 2d
    # conv kernel is repeated along the new temporal axis and divided by
    # the temporal kernel size so the initial response is unchanged.
    for name, module in resnet3d_50_pretrain.named_modules():
        if len(name.split('.')) == 4:
            # layer.block.module.submodule
            prefix = name.split('.')[:2]
            module_type = name.split('.')[2]
            submodule_type = name.split('.')[3]

            if module_type == 'downsample':
                # torchvision names downsample children '0' (conv), '1' (bn)
                name2d = name.replace('conv', '0').replace('bn', '1')
            else:
                layer_id = name.split('.')[2][-1]
                name2d = prefix[0] + '.' + prefix[1] + '.' + \
                    submodule_type + layer_id

            if isinstance(module, nn.Conv3d):
                conv2d_weight = chkp_2d[name2d + '.weight']
                conv3d_weight = getattr(module, 'weight').data
                assert torch.equal(
                    conv3d_weight,
                    conv2d_weight.data.unsqueeze(2).expand_as(conv3d_weight) /
                    conv3d_weight.shape[2])
                if getattr(module, 'bias') is not None:
                    conv2d_bias = chkp_2d[name2d + '.bias']
                    conv3d_bias = getattr(module, 'bias').data
                    assert torch.equal(conv2d_bias, conv3d_bias)

            elif isinstance(module, nn.BatchNorm3d):
                # BN statistics are copied over verbatim
                for pname in ['weight', 'bias', 'running_mean', 'running_var']:
                    param_2d = chkp_2d[name2d + '.' + pname]
                    param_3d = getattr(module, pname).data
                    assert torch.equal(param_2d, param_3d)

    conv3d = resnet3d_50_pretrain.conv1.conv
    assert torch.equal(
        conv3d.weight,
        chkp_2d['conv1.weight'].unsqueeze(2).expand_as(conv3d.weight) /
        conv3d.weight.shape[2])
    conv3d = resnet3d_50_pretrain.layer3[2].conv2.conv
    assert torch.equal(
        conv3d.weight, chkp_2d['layer3.2.conv2.weight'].unsqueeze(2).expand_as(
            conv3d.weight) / conv3d.weight.shape[2])

    # resnet3d with depth 34, no pretrained, norm_eval False
    resnet3d_34_no_bn_eval = ResNet3d(
        34, None, pretrained2d=False, norm_eval=False)
    resnet3d_34_no_bn_eval.init_weights()
    resnet3d_34_no_bn_eval.train()
    assert check_norm_state(resnet3d_34_no_bn_eval.modules(), True)

    # resnet3d with depth 50, no pretrained, norm_eval False
    resnet3d_50_no_bn_eval = ResNet3d(
        50, None, pretrained2d=False, norm_eval=False)
    resnet3d_50_no_bn_eval.init_weights()
    resnet3d_50_no_bn_eval.train()
    assert check_norm_state(resnet3d_50_no_bn_eval.modules(), True)

    # resnet3d with depth 34, no pretrained, frozen_stages, norm_eval False
    frozen_stages = 1
    resnet3d_34_frozen = ResNet3d(
        34, None, pretrained2d=False, frozen_stages=frozen_stages)
    resnet3d_34_frozen.init_weights()
    resnet3d_34_frozen.train()
    assert resnet3d_34_frozen.conv1.bn.training is False
    for param in resnet3d_34_frozen.conv1.parameters():
        assert param.requires_grad is False
    for i in range(1, frozen_stages + 1):
        layer = getattr(resnet3d_34_frozen, f'layer{i}')
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False
    # test zero_init_residual: the last BN of each block starts at zero
    for m in resnet3d_34_frozen.modules():
        if hasattr(m, 'conv2'):
            assert torch.equal(m.conv2.bn.weight,
                               torch.zeros_like(m.conv2.bn.weight))
            assert torch.equal(m.conv2.bn.bias,
                               torch.zeros_like(m.conv2.bn.bias))

    # resnet3d with depth 50, no pretrained, frozen_stages, norm_eval False
    frozen_stages = 1
    resnet3d_50_frozen = ResNet3d(
        50, None, pretrained2d=False, frozen_stages=frozen_stages)
    resnet3d_50_frozen.init_weights()
    resnet3d_50_frozen.train()
    assert resnet3d_50_frozen.conv1.bn.training is False
    for param in resnet3d_50_frozen.conv1.parameters():
        assert param.requires_grad is False
    for i in range(1, frozen_stages + 1):
        layer = getattr(resnet3d_50_frozen, f'layer{i}')
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False
    # test zero_init_residual
    for m in resnet3d_50_frozen.modules():
        if hasattr(m, 'conv3'):
            assert torch.equal(m.conv3.bn.weight,
                               torch.zeros_like(m.conv3.bn.weight))
            assert torch.equal(m.conv3.bn.bias,
                               torch.zeros_like(m.conv3.bn.bias))

    # resnet3d frozen with depth 34 inference
    input_shape = (1, 3, 6, 64, 64)
    imgs = generate_backbone_demo_inputs(input_shape)
    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_34_frozen = resnet3d_34_frozen.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_34_frozen(imgs_gpu)
            assert feat.shape == torch.Size([1, 512, 3, 2, 2])
    else:
        feat = resnet3d_34_frozen(imgs)
        assert feat.shape == torch.Size([1, 512, 3, 2, 2])

    # resnet3d with depth 50 inference
    input_shape = (1, 3, 6, 64, 64)
    imgs = generate_backbone_demo_inputs(input_shape)
    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_50_frozen = resnet3d_50_frozen.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_50_frozen(imgs_gpu)
            assert feat.shape == torch.Size([1, 2048, 3, 2, 2])
    else:
        feat = resnet3d_50_frozen(imgs)
        assert feat.shape == torch.Size([1, 2048, 3, 2, 2])

    # resnet3d with depth 50 in caffe style inference
    resnet3d_50_caffe = ResNet3d(50, None, pretrained2d=False, style='caffe')
    resnet3d_50_caffe.init_weights()
    resnet3d_50_caffe.train()

    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_50_caffe = resnet3d_50_caffe.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_50_caffe(imgs_gpu)
            assert feat.shape == torch.Size([1, 2048, 3, 2, 2])
    else:
        feat = resnet3d_50_caffe(imgs)
        assert feat.shape == torch.Size([1, 2048, 3, 2, 2])

    # resnet3d with depth 34 in caffe style inference
    resnet3d_34_caffe = ResNet3d(34, None, pretrained2d=False, style='caffe')
    resnet3d_34_caffe.init_weights()
    resnet3d_34_caffe.train()
    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_34_caffe = resnet3d_34_caffe.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_34_caffe(imgs_gpu)
            assert feat.shape == torch.Size([1, 512, 3, 2, 2])
    else:
        feat = resnet3d_34_caffe(imgs)
        assert feat.shape == torch.Size([1, 512, 3, 2, 2])

    # resnet3d with depth 50, 3x3x3 inflate_style inference
    # (renamed from resnet3d_50_1x1x1, which contradicted the config)
    resnet3d_50_3x3x3 = ResNet3d(
        50, None, pretrained2d=False, inflate_style='3x3x3')
    resnet3d_50_3x3x3.init_weights()
    resnet3d_50_3x3x3.train()
    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_50_3x3x3 = resnet3d_50_3x3x3.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_50_3x3x3(imgs_gpu)
            assert feat.shape == torch.Size([1, 2048, 3, 2, 2])
    else:
        feat = resnet3d_50_3x3x3(imgs)
        assert feat.shape == torch.Size([1, 2048, 3, 2, 2])

    # resnet3d with depth 34, 3x3x3 inflate_style inference
    resnet3d_34_3x3x3 = ResNet3d(
        34, None, pretrained2d=False, inflate_style='3x3x3')
    resnet3d_34_3x3x3.init_weights()
    resnet3d_34_3x3x3.train()

    # parrots 3dconv is only implemented on gpu
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            resnet3d_34_3x3x3 = resnet3d_34_3x3x3.cuda()
            imgs_gpu = imgs.cuda()
            feat = resnet3d_34_3x3x3(imgs_gpu)
            assert feat.shape == torch.Size([1, 512, 3, 2, 2])
    else:
        feat = resnet3d_34_3x3x3(imgs)
        assert feat.shape == torch.Size([1, 512, 3, 2, 2])

    # resnet3d with non-local module
    non_local_cfg = dict(
        sub_sample=True,
        use_scale=False,
        norm_cfg=dict(type='BN3d', requires_grad=True),
        mode='embedded_gaussian')
    non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
    resnet3d_nonlocal = ResNet3d(
        50,
        None,
        pretrained2d=False,
        non_local=non_local,
        non_local_cfg=non_local_cfg)
    resnet3d_nonlocal.init_weights()
    # non-local blocks are attached to every other block of layer2/3
    for layer_name in ['layer2', 'layer3']:
        layer = getattr(resnet3d_nonlocal, layer_name)
        for i, _ in enumerate(layer):
            if i % 2 == 0:
                assert hasattr(layer[i], 'non_local_block')

    feat = resnet3d_nonlocal(imgs)
    assert feat.shape == torch.Size([1, 2048, 3, 2, 2])


def test_resnet3d_layer():
    """Test the standalone ResNet3dLayer (a single res-stage backbone)."""
    with pytest.raises(AssertionError):
        # depth must be a supported ResNet depth
        ResNet3dLayer(22, None)

    with pytest.raises(AssertionError):
        # stage index must be < 4
        ResNet3dLayer(50, None, stage=4)

    res_layer = ResNet3dLayer(50, None, stage=3, norm_eval=True)
    res_layer.init_weights()
    res_layer.train()
    input_shape = (1, 1024, 1, 4, 4)
    imgs = generate_backbone_demo_inputs(input_shape)
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            res_layer = res_layer.cuda()
            imgs_gpu = imgs.cuda()
            feat = res_layer(imgs_gpu)
            assert feat.shape == torch.Size([1, 2048, 1, 2, 2])
    else:
        feat = res_layer(imgs)
        assert feat.shape == torch.Size([1, 2048, 1, 2, 2])

    res_layer = ResNet3dLayer(
        50, 'torchvision://resnet50', stage=3, all_frozen=True)
    res_layer.init_weights()
    res_layer.train()
    imgs = generate_backbone_demo_inputs(input_shape)
    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            res_layer = res_layer.cuda()
            imgs_gpu = imgs.cuda()
            feat = res_layer(imgs_gpu)
            assert feat.shape == torch.Size([1, 2048, 1, 2, 2])
    else:
        feat = res_layer(imgs)
        assert feat.shape == torch.Size([1, 2048, 1, 2, 2])
def test_resnet_csn_backbone():
    """Test resnet_csn backbone."""
    # only the 'ip' and 'ir' bottleneck modes are valid
    with pytest.raises(ValueError):
        ResNet3dCSN(152, None, bottleneck_mode='id')

    imgs = generate_backbone_demo_inputs((2, 3, 6, 64, 64))

    # with bn_frozen, no BN parameter may require gradients
    frozen_model = ResNet3dCSN(152, None, bn_frozen=True, norm_eval=True)
    frozen_model.train()
    for mod in frozen_model.modules():
        if isinstance(mod, _BatchNorm):
            for param in mod.parameters():
                assert param.requires_grad is False

    # 'ip' keeps an extra pointwise conv before the depthwise conv
    # ('interaction-preserved'); 'ir' drops it ('interaction-reduced').
    for mode, conv2_len in (('ip', 2), ('ir', 1)):
        model = ResNet3dCSN(152, None, bottleneck_mode=mode)
        model.init_weights()
        model.train()
        for idx, layer_name in enumerate(model.res_layers):
            res_layer = getattr(model, layer_name)
            assert len(res_layer) == model.stage_blocks[idx]
            for block in res_layer:
                # conv2 is a Sequential ending in a depthwise 3d conv
                assert isinstance(block.conv2, nn.Sequential)
                assert len(block.conv2) == conv2_len
                assert block.conv2[conv2_len - 1].groups == block.planes
        if torch.__version__ == 'parrots':
            if torch.cuda.is_available():
                feat = model.cuda()(imgs.cuda())
                assert feat.shape == torch.Size([2, 2048, 1, 2, 2])
        else:
            feat = model(imgs)
            assert feat.shape == torch.Size([2, 2048, 1, 2, 2])

    # train(False) must propagate to every child module
    eval_model = ResNet3dCSN(152, None, bottleneck_mode='ip')
    eval_model.init_weights()
    eval_model.train(False)
    for child in eval_model.children():
        assert child.training is False
def test_slowfast_backbone():
    """Test SlowFast backbone."""
    # the pathway config must be a dict
    with pytest.raises(TypeError):
        ResNet3dSlowFast(slow_pathway=list(['foo', 'bar']))
    # unknown pathway types are rejected
    with pytest.raises(KeyError):
        ResNet3dSlowFast(slow_pathway=dict(type='resnext'))

    # slow pathway inflated from 2d torchvision weights
    inflated = ResNet3dSlowFast(
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained='torchvision://resnet50',
            pretrained2d=True,
            lateral=True,
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1)))
    inflated.init_weights()
    inflated.train()

    # slow pathway without lateral connections from the fast pathway
    wo_lateral = ResNet3dSlowFast(
        None,
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            lateral=False,
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1)))
    wo_lateral.init_weights()
    wo_lateral.train()

    imgs = generate_backbone_demo_inputs((1, 3, 8, 64, 64))
    feat = wo_lateral(imgs)
    # (slow, fast) feature pair
    assert isinstance(feat, tuple)
    assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2])
    assert feat[1].shape == torch.Size([1, 256, 8, 2, 2])

    # frozen stages: lateral connections up to frozen_slow are frozen too
    frozen_slow = 3
    frozen_model = ResNet3dSlowFast(
        None,
        slow_pathway=dict(
            type='resnet3d',
            depth=50,
            pretrained=None,
            pretrained2d=True,
            lateral=True,
            conv1_kernel=(1, 7, 7),
            dilations=(1, 1, 1, 1),
            conv1_stride_t=1,
            pool1_stride_t=1,
            inflate=(0, 0, 1, 1),
            frozen_stages=frozen_slow))
    frozen_model.init_weights()
    frozen_model.train()

    for stage in range(1, frozen_model.slow_path.num_stages):
        lateral_name = frozen_model.slow_path.lateral_connections[stage - 1]
        conv_lateral = getattr(frozen_model.slow_path, lateral_name)
        trainable = stage > frozen_slow
        for mod in conv_lateral.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is trainable
        for param in conv_lateral.parameters():
            assert param.requires_grad is trainable

    # default configuration end-to-end
    default_model = ResNet3dSlowFast()
    default_model.init_weights()
    default_model.train()

    imgs = generate_backbone_demo_inputs((1, 3, 8, 64, 64))
    feat = default_model(imgs)
    assert isinstance(feat, tuple)
    assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2])
    assert feat[1].shape == torch.Size([1, 256, 8, 2, 2])
+import pytest +import torch + +from mmaction.models import ResNet3dSlowOnly +from mmaction.testing import generate_backbone_demo_inputs + + +def test_slowonly_backbone(): + """Test SlowOnly backbone.""" + with pytest.raises(AssertionError): + # SlowOnly should contain no lateral connection + ResNet3dSlowOnly(depth=50, pretrained=None, lateral=True) + + # test SlowOnly for PoseC3D + so_50 = ResNet3dSlowOnly( + depth=50, + pretrained=None, + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 2), + dilations=(1, 1, 1)) + so_50.init_weights() + so_50.train() + + # test SlowOnly with normal config + so_50 = ResNet3dSlowOnly(depth=50, pretrained=None) + so_50.init_weights() + so_50.train() + + # SlowOnly inference test + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + so_50 = so_50.cuda() + imgs_gpu = imgs.cuda() + feat = so_50(imgs_gpu) + else: + feat = so_50(imgs) + assert feat.shape == torch.Size([1, 2048, 8, 2, 2]) diff --git a/tests/models/backbones/test_resnet_audio.py b/tests/models/backbones/test_resnet_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..826ba3f67cfd8f7a412df3c0456c7f98a91f1bd5 --- /dev/null +++ b/tests/models/backbones/test_resnet_audio.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import ResNetAudio +from mmaction.testing import generate_backbone_demo_inputs +from mmaction.utils import register_all_modules + + +def test_resnet_audio_backbone(): + """Test ResNetAudio backbone.""" + input_shape = (1, 1, 16, 16) + spec = generate_backbone_demo_inputs(input_shape) + # inference + register_all_modules() + audioonly = ResNetAudio(50, None) + audioonly.init_weights() + audioonly.train() + feat = audioonly(spec) + assert feat.shape == torch.Size([1, 1024, 2, 2]) diff --git a/tests/models/backbones/test_resnet_omni.py b/tests/models/backbones/test_resnet_omni.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f5da14046d3d68588eea1ab0fe8e70cc02f027 --- /dev/null +++ b/tests/models/backbones/test_resnet_omni.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torchvision + +from mmaction.models import OmniResNet +from mmaction.testing import generate_backbone_demo_inputs + + +def test_x3d_backbone(): + """Test x3d backbone.""" + _ = OmniResNet() + + resnet50 = torchvision.models.resnet50() + params = resnet50.state_dict() + torch.save(params, './r50.pth') + model = OmniResNet(pretrain_2d='./r50.pth') + + input_shape = (2, 3, 8, 64, 64) + videos = generate_backbone_demo_inputs(input_shape) + feat = model(videos) + assert feat.shape == torch.Size([2, 2048, 8, 2, 2]) + + input_shape = (2, 3, 64, 64) + images = generate_backbone_demo_inputs(input_shape) + feat = model(images) + assert feat.shape == torch.Size([2, 2048, 2, 2]) diff --git a/tests/models/backbones/test_resnet_tin.py b/tests/models/backbones/test_resnet_tin.py new file mode 100644 index 0000000000000000000000000000000000000000..26f0aab13d463dceb09af09db4a5ecced94d525e --- /dev/null +++ b/tests/models/backbones/test_resnet_tin.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +import torch +import torch.nn as nn + +from mmaction.models import ResNetTIN +from mmaction.testing import generate_backbone_demo_inputs + + +@pytest.mark.skipif( + not torch.cuda.is_available(), reason='requires CUDA support') +def test_resnet_tin_backbone(): + """Test resnet_tin backbone.""" + with pytest.raises(AssertionError): + # num_segments should be positive + resnet_tin = ResNetTIN(50, num_segments=-1) + resnet_tin.init_weights() + + from mmaction.models.backbones.resnet_tin import (CombineNet, + TemporalInterlace) + + # resnet_tin with normal config + resnet_tin = ResNetTIN(50) + resnet_tin.init_weights() + for layer_name in resnet_tin.res_layers: + layer = getattr(resnet_tin, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block.conv1.conv, CombineNet) + assert isinstance(block.conv1.conv.net1, TemporalInterlace) + assert ( + block.conv1.conv.net1.num_segments == resnet_tin.num_segments) + assert block.conv1.conv.net1.shift_div == resnet_tin.shift_div + + # resnet_tin with partial batchnorm + resnet_tin_pbn = ResNetTIN(50, partial_bn=True) + resnet_tin_pbn.train() + count_bn = 0 + for m in resnet_tin_pbn.modules(): + if isinstance(m, nn.BatchNorm2d): + count_bn += 1 + if count_bn >= 2: + assert m.training is False + assert m.weight.requires_grad is False + assert m.bias.requires_grad is False + else: + assert m.training is True + assert m.weight.requires_grad is True + assert m.bias.requires_grad is True + + input_shape = (8, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape).cuda() + resnet_tin = resnet_tin.cuda() + + # resnet_tin with normal cfg inference + feat = resnet_tin(imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) diff --git a/tests/models/backbones/test_resnet_tsm.py b/tests/models/backbones/test_resnet_tsm.py new file mode 100644 index 0000000000000000000000000000000000000000..9f852df43eaae48a70fa61825a95c905f21f1f39 --- /dev/null +++ 
class Test_ResNet_TSM(TestCase):
    """Unit tests for the ResNetTSM backbone."""

    def setUp(self):
        # shared (N*T, C, H, W) input: 8 frames of 3x64x64
        input_shape = (8, 3, 64, 64)
        self.imgs = generate_backbone_demo_inputs(input_shape)

    def test_init(self):
        """shift_place must be 'block' or 'blockres'."""
        with pytest.raises(NotImplementedError):
            resnet_tsm_50_block = ResNetTSM(50, shift_place='Block')
            resnet_tsm_50_block.init_weights()

    def test_init_from_scratch(self):
        """The backbone builds and initializes without 2d weights."""
        resnet_tsm_50 = ResNetTSM(50, pretrained=None, pretrained2d=False)
        resnet_tsm_50.init_weights()

    def test_resnet_tsm_temporal_shift_blockres(self):
        """Default 'blockres' placement wraps every block's conv1."""
        resnet_tsm_50 = ResNetTSM(50, pretrained='torchvision://resnet50')
        resnet_tsm_50.init_weights()
        for layer_name in resnet_tsm_50.res_layers:
            layer = getattr(resnet_tsm_50, layer_name)
            blocks = list(layer.children())
            for block in blocks:
                assert isinstance(block.conv1.conv, TemporalShift)
                assert block.conv1.conv.num_segments == resnet_tsm_50.num_segments  # noqa: E501
                assert block.conv1.conv.shift_div == resnet_tsm_50.shift_div
                assert isinstance(block.conv1.conv.net, nn.Conv2d)
        feat = resnet_tsm_50(self.imgs)
        assert feat.shape == torch.Size([8, 2048, 2, 2])

    def test_resnet_tsm_temporal_shift_block(self):
        """'block' placement wraps whole residual blocks instead."""
        resnet_tsm_50_block = ResNetTSM(
            50, shift_place='block', pretrained='torchvision://resnet50')
        resnet_tsm_50_block.init_weights()
        for layer_name in resnet_tsm_50_block.res_layers:
            layer = getattr(resnet_tsm_50_block, layer_name)
            blocks = list(layer.children())
            for block in blocks:
                assert isinstance(block, TemporalShift)
                # NOTE: this assertion used to be duplicated; one copy removed
                assert block.num_segments == resnet_tsm_50_block.num_segments
                assert block.shift_div == resnet_tsm_50_block.shift_div
                assert isinstance(block.net, Bottleneck)

    def test_resnet_tsm_temporal_pool(self):
        """temporal_pool halves num_segments after layer2."""
        resnet_tsm_50_temporal_pool = ResNetTSM(
            50, temporal_pool=True, pretrained='torchvision://resnet50')
        resnet_tsm_50_temporal_pool.init_weights()
        for layer_name in resnet_tsm_50_temporal_pool.res_layers:
            layer = getattr(resnet_tsm_50_temporal_pool, layer_name)
            blocks = list(layer.children())

            if layer_name == 'layer2':
                # layer2 gains a trailing temporal MaxPool3d
                assert len(blocks) == 2
                assert isinstance(blocks[1], nn.MaxPool3d)
                blocks = copy.deepcopy(blocks[0])

            for block in blocks:
                assert isinstance(block.conv1.conv, TemporalShift)
                if layer_name == 'layer1':
                    assert block.conv1.conv.num_segments == \
                        resnet_tsm_50_temporal_pool.num_segments
                else:
                    # after the pool, shifts operate on half the segments
                    assert block.conv1.conv.num_segments == \
                        resnet_tsm_50_temporal_pool.num_segments // 2
                assert block.conv1.conv.shift_div == resnet_tsm_50_temporal_pool.shift_div  # noqa: E501
                assert isinstance(block.conv1.conv.net, nn.Conv2d)

        feat = resnet_tsm_50_temporal_pool(self.imgs)
        assert feat.shape == torch.Size([4, 2048, 2, 2])

    def test_resnet_tsm_non_local(self):
        """Non-local blocks wrap every other block of layer2/3."""
        non_local_cfg = dict(
            sub_sample=True,
            use_scale=False,
            norm_cfg=dict(type='BN3d', requires_grad=True),
            mode='embedded_gaussian')
        non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
        resnet_tsm_nonlocal = ResNetTSM(
            50,
            non_local=non_local,
            non_local_cfg=non_local_cfg,
            pretrained='torchvision://resnet50')
        resnet_tsm_nonlocal.init_weights()
        for layer_name in ['layer2', 'layer3']:
            layer = getattr(resnet_tsm_nonlocal, layer_name)
            for i, _ in enumerate(layer):
                if i % 2 == 0:
                    assert isinstance(layer[i], NL3DWrapper)

        feat = resnet_tsm_nonlocal(self.imgs)
        assert feat.shape == torch.Size([8, 2048, 2, 2])

    def test_resnet_tsm_full(self):
        """Non-local + temporal_pool combined end-to-end."""
        non_local_cfg = dict(
            sub_sample=True,
            use_scale=False,
            norm_cfg=dict(type='BN3d', requires_grad=True),
            mode='embedded_gaussian')
        non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
        resnet_tsm_50_full = ResNetTSM(
            50,
            pretrained='torchvision://resnet50',
            non_local=non_local,
            non_local_cfg=non_local_cfg,
            temporal_pool=True)
        resnet_tsm_50_full.init_weights()

        input_shape = (16, 3, 32, 32)
        imgs = generate_backbone_demo_inputs(input_shape)
        feat = resnet_tsm_50_full(imgs)
        assert feat.shape == torch.Size([8, 2048, 1, 1])
+import pytest +import torch + +from mmaction.models import RGBPoseConv3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_rgbposeconv3d(): + """Test RGBPoseConv3D backbone.""" + + with pytest.raises(AssertionError): + RGBPoseConv3D(pose_drop_path=1.1, rgb_drop_path=1.1) + + rgbposec3d = RGBPoseConv3D() + rgbposec3d.init_weights() + rgbposec3d.train() + + imgs_shape = (1, 3, 8, 224, 224) + heatmap_imgs_shape = (1, 17, 32, 56, 56) + imgs = generate_backbone_demo_inputs(imgs_shape) + heatmap_imgs = generate_backbone_demo_inputs(heatmap_imgs_shape) + + (x_rgb, x_pose) = rgbposec3d(imgs, heatmap_imgs) + + assert x_rgb.shape == torch.Size([1, 2048, 8, 7, 7]) + assert x_pose.shape == torch.Size([1, 512, 32, 7, 7]) diff --git a/tests/models/backbones/test_stgcn.py b/tests/models/backbones/test_stgcn.py new file mode 100644 index 0000000000000000000000000000000000000000..45bcb4bb54c1fd4b2fcd028c6fd1c1f3a869808d --- /dev/null +++ b/tests/models/backbones/test_stgcn.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import STGCN + + +def test_stgcn_backbone(): + """Test STGCN backbone.""" + + mode = 'stgcn_spatial' + batch_size, num_person, num_frames = 2, 2, 150 + + # openpose-18 layout + num_joints = 18 + model = STGCN(graph_cfg=dict(layout='openpose', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 18]) + + # nturgb+d layout + num_joints = 25 + model = STGCN(graph_cfg=dict(layout='nturgb+d', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 25]) + + # coco layout + num_joints = 17 + model = STGCN(graph_cfg=dict(layout='coco', mode=mode)) + model.init_weights() + inputs = torch.randn(batch_size, num_person, num_frames, num_joints, 3) + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) + + # custom settings + # instantiate STGCN++ + model = STGCN( + graph_cfg=dict(layout='coco', mode='spatial'), + gcn_adaptive='init', + gcn_with_res=True, + tcn_type='mstcn') + model.init_weights() + output = model(inputs) + assert output.shape == torch.Size([2, 2, 256, 38, 17]) diff --git a/tests/models/backbones/test_swin.py b/tests/models/backbones/test_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..be3921f1361ffe0428b814e7f41ad47705c3c3b4 --- /dev/null +++ b/tests/models/backbones/test_swin.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +import torch + +from mmaction.models import SwinTransformer3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_swin_backbone(): + """Test swin backbone.""" + with pytest.raises(AssertionError): + SwinTransformer3D(arch='-t') + + with pytest.raises(AssertionError): + SwinTransformer3D(arch={'embed_dims': 96}) + + with pytest.raises(AssertionError): + SwinTransformer3D(arch={ + 'embed_dims': 96, + 'depths': [2, 2, 6], + 'num_heads': [3, 6, 12, 24] + }) + + with pytest.raises(AssertionError): + SwinTransformer3D( + arch={ + 'embed_dims': 96, + 'depths': [2, 2, 6, 2, 2], + 'num_heads': [3, 6, 12, 24, 48] + }) + + with pytest.raises(AssertionError): + SwinTransformer3D(arch='t', out_indices=(4, )) + + with pytest.raises(TypeError): + swin_t = SwinTransformer3D(arch='t', pretrained=[0, 1, 1]) + swin_t.init_weights() + + with pytest.raises(TypeError): + swin_t = SwinTransformer3D(arch='t') + swin_t.init_weights(pretrained=[0, 1, 1]) + + swin_b = SwinTransformer3D(arch='b', pretrained=None, pretrained2d=False) + swin_b.init_weights() + swin_b.train() + + pretrained_url = 'https://download.openmmlab.com/mmaction/v1.0/' \ + 'recognition/swin/swin_tiny_patch4_window7_224.pth' + + swin_t_pre = SwinTransformer3D( + arch='t', pretrained=pretrained_url, pretrained2d=True) + swin_t_pre.init_weights() + swin_t_pre.train() + + from mmengine.runner.checkpoint import _load_checkpoint + ckpt_2d = _load_checkpoint(pretrained_url, map_location='cpu') + state_dict = ckpt_2d['model'] + + patch_embed_weight2d = state_dict['patch_embed.proj.weight'].data + patch_embed_weight3d = swin_t_pre.patch_embed.proj.weight.data + assert torch.equal( + patch_embed_weight3d, + patch_embed_weight2d.unsqueeze(2).expand_as(patch_embed_weight3d) / + patch_embed_weight3d.shape[2]) + + norm = swin_t_pre.norm3 + assert torch.equal(norm.weight.data, state_dict['norm.weight']) + assert torch.equal(norm.bias.data, state_dict['norm.bias']) + + for name, param in 
swin_t_pre.named_parameters(): + if 'relative_position_bias_table' in name: + bias2d = state_dict[name] + assert torch.equal( + param.data, bias2d.repeat(2 * swin_t_pre.window_size[0] - 1, + 1)) + + frozen_stages = 1 + swin_t_frozen = SwinTransformer3D( + arch='t', + pretrained=None, + pretrained2d=False, + frozen_stages=frozen_stages) + swin_t_frozen.init_weights() + swin_t_frozen.train() + for param in swin_t_frozen.patch_embed.parameters(): + assert param.requires_grad is False + for i in range(frozen_stages): + layer = swin_t_frozen.layers[i] + for param in layer.parameters(): + assert param.requires_grad is False + + input_shape = (1, 3, 6, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + feat = swin_t_frozen(imgs) + assert feat.shape == torch.Size([1, 768, 3, 2, 2]) + + input_shape = (1, 3, 5, 63, 63) + imgs = generate_backbone_demo_inputs(input_shape) + feat = swin_t_frozen(imgs) + assert feat.shape == torch.Size([1, 768, 3, 2, 2]) + + swin_t_all_stages = SwinTransformer3D(arch='t', out_indices=(0, 1, 2, 3)) + feats = swin_t_all_stages(imgs) + assert feats[0].shape == torch.Size([1, 96, 3, 16, 16]) + assert feats[1].shape == torch.Size([1, 192, 3, 8, 8]) + assert feats[2].shape == torch.Size([1, 384, 3, 4, 4]) + assert feats[3].shape == torch.Size([1, 768, 3, 2, 2]) + + swin_t_all_stages_after_ds = SwinTransformer3D( + arch='t', out_indices=(0, 1, 2, 3), out_after_downsample=True) + feats = swin_t_all_stages_after_ds(imgs) + assert feats[0].shape == torch.Size([1, 192, 3, 8, 8]) + assert feats[1].shape == torch.Size([1, 384, 3, 4, 4]) + assert feats[2].shape == torch.Size([1, 768, 3, 2, 2]) + assert feats[3].shape == torch.Size([1, 768, 3, 2, 2]) diff --git a/tests/models/backbones/test_tanet.py b/tests/models/backbones/test_tanet.py new file mode 100644 index 0000000000000000000000000000000000000000..82b74428fc49a1ad243d31bafe83c1f64a99467e --- /dev/null +++ b/tests/models/backbones/test_tanet.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import pytest +import torch + +from mmaction.models import TANet +from mmaction.testing import generate_backbone_demo_inputs + + +def test_tanet_backbone(): + """Test tanet backbone.""" + with pytest.raises(NotImplementedError): + # TA-Blocks are only based on Bottleneck block now + tanet_18 = TANet(18, 8) + tanet_18.init_weights() + + from mmaction.models.backbones.resnet import Bottleneck + from mmaction.models.backbones.tanet import TABlock + + # tanet with depth 50 + tanet_50 = TANet(50, 8) + tanet_50.init_weights() + + for layer_name in tanet_50.res_layers: + layer = getattr(tanet_50, layer_name) + blocks = list(layer.children()) + for block in blocks: + assert isinstance(block, TABlock) + assert isinstance(block.block, Bottleneck) + assert block.tam.num_segments == block.num_segments + assert block.tam.in_channels == block.block.conv1.out_channels + + input_shape = (8, 3, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + feat = tanet_50(imgs) + assert feat.shape == torch.Size([8, 2048, 2, 2]) + + input_shape = (16, 3, 32, 32) + imgs = generate_backbone_demo_inputs(input_shape) + feat = tanet_50(imgs) + assert feat.shape == torch.Size([16, 2048, 1, 1]) diff --git a/tests/models/backbones/test_timesformer.py b/tests/models/backbones/test_timesformer.py new file mode 100644 index 0000000000000000000000000000000000000000..81843e08483502dac2eaa67ab1a722cce354203b --- /dev/null +++ b/tests/models/backbones/test_timesformer.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +import torch + +from mmaction.models import TimeSformer +from mmaction.testing import generate_backbone_demo_inputs + + +def test_timesformer_backbone(): + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + # divided_space_time + timesformer = TimeSformer( + 8, 64, 16, embed_dims=768, attention_type='divided_space_time') + timesformer.init_weights() + from mmaction.models.common import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, + FFNWithNorm) + assert isinstance(timesformer.transformer_layers.layers[0].attentions[0], + DividedTemporalAttentionWithNorm) + assert isinstance(timesformer.transformer_layers.layers[11].attentions[1], + DividedSpatialAttentionWithNorm) + assert isinstance(timesformer.transformer_layers.layers[0].ffns[0], + FFNWithNorm) + assert hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 16 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 768]) + + # space_only + timesformer = TimeSformer( + 8, 64, 16, embed_dims=512, num_heads=8, attention_type='space_only') + timesformer.init_weights() + + assert not hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 16 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 512]) + + # joint_space_time + input_shape = (1, 3, 2, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + timesformer = TimeSformer( + 2, + 64, + 8, + embed_dims=256, + num_heads=8, + attention_type='joint_space_time') + timesformer.init_weights() + + assert hasattr(timesformer, 'time_embed') + assert timesformer.patch_embed.num_patches == 64 + + cls_tokens = timesformer(imgs) + assert cls_tokens.shape == torch.Size([1, 256]) + + with pytest.raises(AssertionError): + # unsupported attention type + timesformer = TimeSformer( + 8, 64, 16, attention_type='wrong_attention_type') + + with pytest.raises(AssertionError): + # Wrong 
transformer_layers type + timesformer = TimeSformer(8, 64, 16, transformer_layers='wrong_type') diff --git a/tests/models/backbones/test_uniformer.py b/tests/models/backbones/test_uniformer.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3c8d173aed930f972ea6f430de1792d03ab473 --- /dev/null +++ b/tests/models/backbones/test_uniformer.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmaction.models import UniFormer +from mmaction.testing import generate_backbone_demo_inputs + + +def test_uniformer_backbone(): + """Test uniformer backbone.""" + input_shape = (1, 3, 16, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormer( + depth=[3, 4, 8, 3], + embed_dim=[64, 128, 320, 512], + head_dim=64, + drop_path_rate=0.1) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 512, 8, 2, 2]) diff --git a/tests/models/backbones/test_uniformerv2.py b/tests/models/backbones/test_uniformerv2.py new file mode 100644 index 0000000000000000000000000000000000000000..be4357bb026383db8e0f8aa30146e217eec6cafa --- /dev/null +++ b/tests/models/backbones/test_uniformerv2.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import UniFormerV2 +from mmaction.testing import generate_backbone_demo_inputs + + +def test_uniformerv2_backbone(): + """Test uniformer backbone.""" + input_shape = (1, 3, 8, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormerV2( + input_resolution=64, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=8, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + clip_pretrained=False, + mlp_dropout=[0.5, 0.5, 0.5, 0.5]) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 768]) + + # SthSth + input_shape = (1, 3, 16, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + + model = UniFormerV2( + input_resolution=64, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=16, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=True, + no_lmhra=False, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + clip_pretrained=False, + mlp_dropout=[0.5, 0.5, 0.5, 0.5]) + model.init_weights() + + model.eval() + assert model(imgs).shape == torch.Size([1, 768]) diff --git a/tests/models/backbones/test_vit_mae.py b/tests/models/backbones/test_vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..da184c675d588a46020a4cd8b698e9bee3d97bfa --- /dev/null +++ b/tests/models/backbones/test_vit_mae.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmaction.models import VisionTransformer + + +def test_vit_backbone(): + """Test vit backbone.""" + x = torch.randn(1, 3, 8, 64, 64) + model = VisionTransformer( + img_size=64, + num_frames=8, + qkv_bias=True, + drop_path_rate=0.2, + init_values=0.1) + model.init_weights() + + assert model(x).shape == torch.Size([1, 768]) + model.eval() + assert model(x).shape == torch.Size([1, 768]) + + model = VisionTransformer( + img_size=64, + num_frames=8, + use_learnable_pos_emb=True, + drop_rate=0.1, + use_mean_pooling=False) + model.init_weights() + + assert model(x).shape == torch.Size([1, 768]) + model.eval() + assert model(x).shape == torch.Size([1, 768]) diff --git a/tests/models/backbones/test_x3d.py b/tests/models/backbones/test_x3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5f4c86b6b3e99b6ed216dd2ed3556e025f600b9e --- /dev/null +++ b/tests/models/backbones/test_x3d.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.models import X3D +from mmaction.testing import check_norm_state, generate_backbone_demo_inputs + + +def test_x3d_backbone(): + """Test x3d backbone.""" + with pytest.raises(AssertionError): + # In X3D: 1 <= num_stages <= 4 + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, num_stages=0) + + with pytest.raises(AssertionError): + # In X3D: 1 <= num_stages <= 4 + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, num_stages=5) + + with pytest.raises(AssertionError): + # len(spatial_strides) == num_stages + X3D(gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + spatial_strides=(1, 2), + num_stages=4) + + with pytest.raises(AssertionError): + # se_style in ['half', 'all'] + X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, se_style=None) + + with pytest.raises(AssertionError): + # se_ratio should be None or > 0 + X3D(gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + se_style='half', + se_ratio=0) + + # x3d_s, 
no pretrained, norm_eval True + x3d_s = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, norm_eval=True) + x3d_s.init_weights() + x3d_s.train() + assert check_norm_state(x3d_s.modules(), False) + + # x3d_l, no pretrained, norm_eval True + x3d_l = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=5.0, norm_eval=True) + x3d_l.init_weights() + x3d_l.train() + assert check_norm_state(x3d_l.modules(), False) + + # x3d_s, no pretrained, norm_eval False + x3d_s = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=2.2, norm_eval=False) + x3d_s.init_weights() + x3d_s.train() + assert check_norm_state(x3d_s.modules(), True) + + # x3d_l, no pretrained, norm_eval False + x3d_l = X3D(gamma_w=1.0, gamma_b=2.25, gamma_d=5.0, norm_eval=False) + x3d_l.init_weights() + x3d_l.train() + assert check_norm_state(x3d_l.modules(), True) + + # x3d_s, no pretrained, frozen_stages, norm_eval False + frozen_stages = 1 + x3d_s_frozen = X3D( + gamma_w=1.0, + gamma_b=2.25, + gamma_d=2.2, + norm_eval=False, + frozen_stages=frozen_stages) + + x3d_s_frozen.init_weights() + x3d_s_frozen.train() + assert x3d_s_frozen.conv1_t.bn.training is False + for param in x3d_s_frozen.conv1_s.parameters(): + assert param.requires_grad is False + for param in x3d_s_frozen.conv1_t.parameters(): + assert param.requires_grad is False + + for i in range(1, frozen_stages + 1): + layer = getattr(x3d_s_frozen, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # test zero_init_residual, zero_init_residual is True by default + for m in x3d_s_frozen.modules(): + if hasattr(m, 'conv3'): + assert torch.equal(m.conv3.bn.weight, + torch.zeros_like(m.conv3.bn.weight)) + assert torch.equal(m.conv3.bn.bias, + torch.zeros_like(m.conv3.bn.bias)) + + # x3d_s inference + input_shape = (1, 3, 13, 64, 64) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 
'parrots': + if torch.cuda.is_available(): + x3d_s_frozen = x3d_s_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = x3d_s_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 432, 13, 2, 2]) + else: + feat = x3d_s_frozen(imgs) + assert feat.shape == torch.Size([1, 432, 13, 2, 2]) + + # x3d_m inference + input_shape = (1, 3, 16, 96, 96) + imgs = generate_backbone_demo_inputs(input_shape) + # parrots 3dconv is only implemented on gpu + if torch.__version__ == 'parrots': + if torch.cuda.is_available(): + x3d_s_frozen = x3d_s_frozen.cuda() + imgs_gpu = imgs.cuda() + feat = x3d_s_frozen(imgs_gpu) + assert feat.shape == torch.Size([1, 432, 16, 3, 3]) + else: + feat = x3d_s_frozen(imgs) + assert feat.shape == torch.Size([1, 432, 16, 3, 3]) diff --git a/tests/models/common/test_conv2plus1d.py b/tests/models/common/test_conv2plus1d.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc30984d0c6eceb441b87c81d8def1d99996455 --- /dev/null +++ b/tests/models/common/test_conv2plus1d.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models.common import Conv2plus1d + + +def test_conv2plus1d(): + with pytest.raises(AssertionError): + # Length of kernel size, stride and padding must be the same + Conv2plus1d(3, 8, (2, 2)) + + conv_2plus1d = Conv2plus1d(3, 8, 2) + conv_2plus1d.init_weights() + + assert torch.equal(conv_2plus1d.bn_s.weight, + torch.ones_like(conv_2plus1d.bn_s.weight)) + assert torch.equal(conv_2plus1d.bn_s.bias, + torch.zeros_like(conv_2plus1d.bn_s.bias)) + + x = torch.rand(1, 3, 8, 256, 256) + output = conv_2plus1d(x) + assert output.shape == torch.Size([1, 8, 7, 255, 255]) diff --git a/tests/models/common/test_conv_audio.py b/tests/models/common/test_conv_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..448be07d7b1c9cd343c875449f8377d63b6ba980 --- /dev/null +++ b/tests/models/common/test_conv_audio.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import torch + +from mmaction.models.common import ConvAudio + + +def test_conv_audio(): + conv_audio = ConvAudio(3, 8, 3) + conv_audio.init_weights() + + x = torch.rand(1, 3, 8, 8) + output = conv_audio(x) + assert output.shape == torch.Size([1, 16, 8, 8]) + + conv_audio_sum = ConvAudio(3, 8, 3, op='sum') + output = conv_audio_sum(x) + assert output.shape == torch.Size([1, 8, 8, 8]) diff --git a/tests/models/common/test_sub_batchnorm3d.py b/tests/models/common/test_sub_batchnorm3d.py new file mode 100644 index 0000000000000000000000000000000000000000..ade756ec083d59d6bfd006ef23301f9e17a1e75f --- /dev/null +++ b/tests/models/common/test_sub_batchnorm3d.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmaction.models.common import SubBatchNorm3D + + +def test_SubBatchNorm3D(): + _cfg = dict(num_splits=2) + num_features = 4 + sub_batchnorm_3d = SubBatchNorm3D(num_features, **_cfg) + assert sub_batchnorm_3d.bn.num_features == num_features + assert sub_batchnorm_3d.split_bn.num_features == num_features * 2 + + assert sub_batchnorm_3d.bn.affine is False + assert sub_batchnorm_3d.split_bn.affine is False diff --git a/tests/models/common/test_tam.py b/tests/models/common/test_tam.py new file mode 100644 index 0000000000000000000000000000000000000000..ee72498a36c7c4f71ec73dc375b4de1412271ada --- /dev/null +++ b/tests/models/common/test_tam.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pytest +import torch + +from mmaction.models.common import TAM + + +def test_TAM(): + """test TAM.""" + with pytest.raises(AssertionError): + # alpha must be a positive integer + TAM(16, 8, alpha=0, beta=4) + + with pytest.raises(AssertionError): + # beta must be a positive integer + TAM(16, 8, alpha=2, beta=0) + + with pytest.raises(AssertionError): + # the channels number of x should be equal to self.in_channels of TAM + tam = TAM(16, 8) + x = torch.rand(64, 8, 112, 112) + tam(x) + + tam = TAM(16, 8) + x = torch.rand(32, 16, 112, 112) + output = tam(x) + assert output.shape == torch.Size([32, 16, 112, 112]) diff --git a/tests/models/common/test_transformer.py b/tests/models/common/test_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2ae2329922532d3ec71cf2371f5b7d710ce5b7 --- /dev/null +++ b/tests/models/common/test_transformer.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.testing import assert_params_all_zeros + +from mmaction.models.common import (DividedSpatialAttentionWithNorm, + DividedTemporalAttentionWithNorm, + FFNWithNorm) + + +def test_divided_temporal_attention_with_norm(): + _cfg = dict(embed_dims=768, num_heads=12, num_frames=8) + divided_temporal_attention = DividedTemporalAttentionWithNorm(**_cfg) + assert isinstance(divided_temporal_attention.norm, nn.LayerNorm) + assert assert_params_all_zeros(divided_temporal_attention.temporal_fc) + + x = torch.rand(1, 1 + 8 * 14 * 14, 768) + output = divided_temporal_attention(x) + assert output.shape == torch.Size([1, 1 + 8 * 14 * 14, 768]) + + +def test_divided_spatial_attention_with_norm(): + _cfg = dict(embed_dims=512, num_heads=8, num_frames=4, dropout_layer=None) + divided_spatial_attention = DividedSpatialAttentionWithNorm(**_cfg) + assert isinstance(divided_spatial_attention.dropout_layer, nn.Identity) + assert isinstance(divided_spatial_attention.norm, nn.LayerNorm) + + x = 
torch.rand(1, 1 + 4 * 14 * 14, 512) + output = divided_spatial_attention(x) + assert output.shape == torch.Size([1, 1 + 4 * 14 * 14, 512]) + + +def test_ffn_with_norm(): + _cfg = dict( + embed_dims=256, feedforward_channels=256 * 2, norm_cfg=dict(type='LN')) + ffn_with_norm = FFNWithNorm(**_cfg) + assert isinstance(ffn_with_norm.norm, nn.LayerNorm) + + x = torch.rand(1, 1 + 4 * 14 * 14, 256) + output = ffn_with_norm(x) + assert output.shape == torch.Size([1, 1 + 4 * 14 * 14, 256]) diff --git a/tests/models/data_preprocessors/__init__.py b/tests/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5df7200f243af520949fa4530efd239dbaa29d --- /dev/null +++ b/tests/models/data_preprocessors/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..b7b482d2f06177c41f2576131d6df8546498cab0 --- /dev/null +++ b/tests/models/data_preprocessors/test_data_preprocessor.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from copy import deepcopy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import ActionDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_shape): + data = { + 'inputs': + [torch.randint(0, 255, input_shape) for _ in range(batch_size)], + 'data_samples': + [ActionDataSample().set_gt_label(2) for _ in range(batch_size)] + } + return data + + +def test_data_preprocessor(): + with pytest.raises(ValueError): + ActionDataPreprocessor( + mean=[1, 1], std=[0, 0], format_shape='NCTHW_Heatmap') + with pytest.raises(ValueError): + psr = ActionDataPreprocessor(format_shape='NCTHW_Heatmap', to_rgb=True) + psr(generate_dummy_data(1, (3, 224, 224))) + + raw_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW') + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1] - psr.mean) / psr.std) + + psr = ActionDataPreprocessor(format_shape='NCTHW', to_rgb=True) + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0][:, [2, 1, 0]]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1][:, [2, 1, 0]]) + + register_all_modules() + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW', + blending=dict(type='MixupBlending', num_classes=5)) + data = psr(deepcopy(raw_data), training=True) + assert data['data_samples'][0].gt_label.shape == (5, ) + assert data['data_samples'][1].gt_label.shape == (5, ) + + raw_data = generate_dummy_data(2, (1, 3, 224, 224)) + psr = 
ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW', + to_rgb=True) + data = psr(deepcopy(raw_data)) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0][:, [2, 1, 0]] - psr.mean) / + psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1][:, [2, 1, 0]] - psr.mean) / + psr.std) + + psr = ActionDataPreprocessor() + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1]) + + raw_2d_data = generate_dummy_data(2, (3, 224, 224)) + raw_3d_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + raw_data = (raw_2d_data, raw_3d_data) + + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='MIX2d3d') + data = psr(raw_data) + assert_array_equal(data[0]['inputs'][0], + (raw_2d_data['inputs'][0] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[0]['inputs'][1], + (raw_2d_data['inputs'][1] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[1]['inputs'][0], + (raw_3d_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data[1]['inputs'][1], + (raw_3d_data['inputs'][1] - psr.mean) / psr.std) + + raw_data = generate_dummy_data(2, (77, )) + psr = ActionDataPreprocessor(to_float32=False) + data = psr(raw_data) + assert data['inputs'].dtype == raw_data['inputs'][0].dtype diff --git a/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..054d1056a6d3f3a91c7e14ce9361e01185b466e3 --- /dev/null +++ b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import MultiModalDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_keys, input_shapes): + data = dict() + data['data_samples'] = [ + ActionDataSample().set_gt_label(2) for _ in range(batch_size) + ] + data['inputs'] = dict() + for key, shape in zip(input_keys, input_shapes): + data['inputs'][key] = [ + torch.randint(0, 255, shape) for _ in range(batch_size) + ] + + return data + + +def test_multimodal_data_preprocessor(): + with pytest.raises(AssertionError): + MultiModalDataPreprocessor( + preprocessors=dict(imgs=dict(format_shape='NCTHW'))) + + register_all_modules() + data_keys = ('imgs', 'heatmap_imgs') + data_shapes = ((1, 3, 8, 224, 224), (1, 17, 32, 64, 64)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs'].shape == (2, 1, 3, 8, 224, 224) + assert data['inputs']['heatmap_imgs'].shape == (2, 1, 17, 32, 64, 64) + psr_imgs = psr.preprocessors['imgs'] + assert_array_equal(data['inputs']['imgs'][0], + (raw_data['inputs']['imgs'][0] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['imgs'][1], + (raw_data['inputs']['imgs'][1] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['heatmap_imgs'][0], + raw_data['inputs']['heatmap_imgs'][0]) + assert_array_equal(data['inputs']['heatmap_imgs'][1], + raw_data['inputs']['heatmap_imgs'][1]) + + data_keys = ('imgs_2D', 'imgs_3D') + data_shapes = ((1, 3, 224, 224), (1, 3, 8, 224, 224)) + raw_data = generate_dummy_data(2, data_keys, 
data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs_2D=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + imgs_3D=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[57.5, 57.5, 57.5], + format_shape='NCTHW'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs_2D'].shape == (2, 1, 3, 224, 224) + assert data['inputs']['imgs_3D'].shape == (2, 1, 3, 8, 224, 224) + psr_imgs2d = psr.preprocessors['imgs_2D'] + psr_imgs3d = psr.preprocessors['imgs_3D'] + assert_array_equal(data['inputs']['imgs_2D'][0], + (raw_data['inputs']['imgs_2D'][0] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_2D'][1], + (raw_data['inputs']['imgs_2D'][1] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_3D'][0], + (raw_data['inputs']['imgs_3D'][0] - psr_imgs3d.mean) / + psr_imgs3d.std) + assert_array_equal(data['inputs']['imgs_3D'][1], + (raw_data['inputs']['imgs_3D'][1] - psr_imgs3d.mean) / + psr_imgs3d.std) diff --git a/tests/models/heads/test_feature_head.py b/tests/models/heads/test_feature_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1f13574f5ed6efdb0eac0a8f6415f93da46539c7 --- /dev/null +++ b/tests/models/heads/test_feature_head.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import pytest
import torch

from mmaction.models import FeatureHead
from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules


class TestFeatureHead(TestCase):
    """Exercise ``FeatureHead`` standalone and as a recognizer's cls_head."""

    def test_2d_recognizer(self):
        """A TSN recognizer with FeatureHead returns one pooled feature."""
        register_all_modules()
        cfg = get_recognizer_cfg(
            'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'  # noqa: E501
        )
        cfg.model['backbone']['pretrained'] = None
        cfg.model['cls_head'] = dict(type='FeatureHead', average_clips='score')
        recognizer = MODELS.build(cfg.model)

        batch = {
            'inputs': [torch.randint(0, 256, [3, 3, 32, 32])],
            'data_samples': [ActionDataSample().set_gt_label(2)],
        }
        feats = recognizer.test_step(batch)
        self.assertIsInstance(feats, torch.Tensor)
        self.assertEqual(feats.shape, torch.Size([1, 2048]))

    def test_3d_recognizer(self):
        """A SlowOnly recognizer with FeatureHead returns one pooled feature."""
        register_all_modules()
        cfg = get_recognizer_cfg(
            'slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py')
        cfg.model['backbone']['pretrained'] = None
        cfg.model['backbone']['pretrained2d'] = False
        cfg.model['cls_head'] = dict(type='FeatureHead', average_clips='score')
        recognizer = MODELS.build(cfg.model)

        batch = {
            'inputs': [torch.randint(0, 256, [1, 3, 4, 32, 32])],
            'data_samples': [ActionDataSample().set_gt_label(2)],
        }
        feats = recognizer.test_step(batch)
        self.assertIsInstance(feats, torch.Tensor)
        self.assertEqual(feats.shape, torch.Size([1, 2048]))

    def test_3d_backbone(self):
        """Spatial/temporal pooling flags control the output rank (NCTHW)."""
        with pytest.raises(NotImplementedError):
            FeatureHead(spatial_type='test')

        inp = torch.rand(1, 64, 2, 7, 7)

        # pool both space and time (defaults)
        self.assertEqual(
            FeatureHead(average_clips='score')(inp).shape,
            torch.Size([1, 64]))
        # keep the spatial map
        self.assertEqual(
            FeatureHead(spatial_type=None, average_clips='score')(inp).shape,
            torch.Size([1, 64, 7, 7]))
        # keep the temporal axis
        self.assertEqual(
            FeatureHead(temporal_type=None, average_clips='score')(inp).shape,
            torch.Size([1, 64, 2]))
        # keep everything
        self.assertEqual(
            FeatureHead(
                spatial_type=None,
                temporal_type=None,
                average_clips='score')(inp).shape,
            torch.Size([1, 64, 2, 7, 7]))

    def test_slowfast_backbone(self):
        """SlowFast tuple inputs are pooled and concatenated on channels."""
        slow = torch.rand(1, 64, 2, 7, 7)
        fast = torch.rand(1, 32, 6, 7, 7)
        pair = (slow, fast)

        head = FeatureHead(backbone_name='slowfast', average_clips='score')
        self.assertEqual(head(pair).shape, torch.Size([1, 96]))

        head = FeatureHead(
            backbone_name='slowfast', spatial_type=None, average_clips='score')
        self.assertEqual(head(pair).shape, torch.Size([1, 96, 7, 7]))

        # temporal pooling is mandatory for slowfast features
        with pytest.raises(AssertionError):
            FeatureHead(
                backbone_name='slowfast',
                temporal_type=None,
                average_clips='score')(pair)

    def test_2d_backbone(self):
        """2D (NCHW) features need ``num_segs`` to recover the batch dim."""
        head = FeatureHead(average_clips='score')
        inp = torch.rand(2, 64, 7, 7)
        with pytest.raises(AssertionError):
            head(inp)  # num_segs is required for 2D inputs

        self.assertEqual(head(inp, num_segs=2).shape, torch.Size([1, 64]))

        inp = torch.rand(2, 64, 7, 7)
        head = FeatureHead(spatial_type=None, average_clips='score')
        self.assertEqual(
            head(inp, num_segs=2).shape, torch.Size([1, 64, 7, 7]))

        head = FeatureHead(temporal_type=None, average_clips='score')
        self.assertEqual(head(inp, num_segs=2).shape, torch.Size([1, 2, 64]))

    def test_tsm_backbone(self):
        """TSM features require ``num_segments`` at construction time."""
        head = FeatureHead(backbone_name='tsm', average_clips='score')
        inp = torch.rand(2, 64, 7, 7)
        with pytest.raises(AssertionError):
            head(inp)
        with pytest.raises(AssertionError):
            head(inp, num_segs=2)

        head = FeatureHead(num_segments=2, average_clips='score')
        self.assertEqual(head(inp, num_segs=2).shape, torch.Size([1, 64]))

        inp = torch.rand(2, 64, 7, 7)
        head = FeatureHead(
            num_segments=2, spatial_type=None, average_clips='score')
        self.assertEqual(
            head(inp, num_segs=2).shape, torch.Size([1, 64, 7, 7]))

    def test_gcn_backbone(self):
        """GCN features are laid out as (N, M, C, T, V)."""
        head = FeatureHead(backbone_name='gcn', average_clips='score')
        inp = torch.rand(1, 5, 64, 2, 7)
        self.assertEqual(head(inp).shape, torch.Size([1, 64]))
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmaction.models import I3DHead


def test_i3d_head():
    """Check I3DHead's default attributes, layers and forward shape."""
    head = I3DHead(num_classes=4, in_channels=2048)
    head.init_weights()

    # constructor defaults survive construction
    assert head.num_classes == 4
    assert head.dropout_ratio == 0.5
    assert head.in_channels == 2048
    assert head.init_std == 0.01

    # dropout layer mirrors the configured ratio
    assert isinstance(head.dropout, nn.Dropout)
    assert head.dropout.p == head.dropout_ratio

    # final classifier maps pooled features to logits
    assert isinstance(head.fc_cls, nn.Linear)
    assert head.fc_cls.in_features == head.in_channels
    assert head.fc_cls.out_features == head.num_classes

    # global spatio-temporal average pooling
    assert isinstance(head.avg_pool, nn.AdaptiveAvgPool3d)
    assert head.avg_pool.output_size == (1, 1, 1)

    # forward: (N, C, T, H, W) -> (N, num_classes)
    scores = head(torch.rand(3, 2048, 4, 7, 7))
    assert scores.shape == torch.Size([3, 4])
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase

import torch
import torch.nn as nn

from mmaction.models import MViTHead


class TestMViTHead(TestCase):
    """Tests for ``MViTHead`` construction, ``pre_logits`` and forward."""

    DEFAULT_ARGS = dict(in_channels=768, num_classes=5)
    # Backbone output: one stage of (patch tokens, cls token).
    fake_feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], )

    def test_init(self):
        """Defaults and layers are wired up as configured."""
        head = MViTHead(**self.DEFAULT_ARGS)
        head.init_weights()
        self.assertEqual(head.dropout.p, head.dropout_ratio)
        self.assertIsInstance(head.fc_cls, nn.Linear)
        self.assertEqual(head.num_classes, 5)
        self.assertEqual(head.dropout_ratio, 0.5)
        self.assertEqual(head.in_channels, 768)
        self.assertEqual(head.init_std, 0.02)

    def test_pre_logits(self):
        """``pre_logits`` hands back the cls token of the last stage."""
        head = MViTHead(**self.DEFAULT_ARGS)
        self.assertIs(
            head.pre_logits(self.fake_feats), self.fake_feats[-1][1])

    def test_forward(self):
        """Forward produces (batch, num_classes) logits."""
        head = MViTHead(**self.DEFAULT_ARGS)
        self.assertEqual(head(self.fake_feats).shape, (4, 5))
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmaction.models import OmniHead


class obj():
    """Minimal stand-in for a data sample carrying one named attribute."""

    def __init__(self, name, value):
        super(obj, self).__init__()
        setattr(self, name, value)


def testOmniHead():
    """OmniHead routes 2D inputs to the image branch and 3D to video."""
    head = OmniHead(image_classes=100, video_classes=200, in_channels=400)

    # NCHW input -> image classifier
    scores = head(torch.randn(2, 400, 8, 8))
    assert scores.shape == torch.Size([2, 100])

    # NCTHW input -> video classifier
    scores = head(torch.randn(2, 400, 8, 8, 8))
    assert scores.shape == torch.Size([2, 200])

    # video branch with the non-linear head variant
    head = OmniHead(
        image_classes=100,
        video_classes=200,
        in_channels=400,
        video_nl_head=True)
    scores = head(torch.randn(2, 400, 8, 8, 8))
    assert scores.shape == torch.Size([2, 200])
    samples = [obj('gt_label', torch.tensor(1)) for _ in range(2)]
    assert 'loss_cls' in head.loss_by_feat(scores, samples)

    # image branch in eval mode with a single sample
    head.eval()
    scores = head(torch.randn(1, 400, 8, 8))
    assert scores.shape == torch.Size([1, 100])
    samples = [obj('gt_label', torch.tensor(1))]
    assert 'loss_cls' in head.loss_by_feat(scores, samples)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmaction.models import RGBPoseHead


def test_rgbpose_head():
    """Check RGBPoseHead attributes, per-branch layers and forward shapes."""
    head = RGBPoseHead(
        num_classes=4,
        in_channels=[2048, 512],
        dropout=dict(rgb=0.51, pose=0.49))
    head.init_weights()

    # configured attributes survive construction
    assert head.num_classes == 4
    assert head.dropout == dict(rgb=0.51, pose=0.49)
    assert head.in_channels == [2048, 512]
    assert head.init_std == 0.01

    # one dropout per modality, each with its own ratio
    assert isinstance(head.dropout_rgb, nn.Dropout)
    assert isinstance(head.dropout_pose, nn.Dropout)
    assert head.dropout_rgb.p == head.dropout['rgb']
    assert head.dropout_pose.p == head.dropout['pose']

    # one linear classifier per modality
    assert isinstance(head.fc_rgb, nn.Linear)
    assert isinstance(head.fc_pose, nn.Linear)
    assert head.fc_rgb.in_features == head.in_channels[0]
    assert head.fc_rgb.out_features == head.num_classes
    assert head.fc_pose.in_features == head.in_channels[1]
    assert head.fc_pose.out_features == head.num_classes

    # shared global average pooling
    assert isinstance(head.avg_pool, nn.AdaptiveAvgPool3d)
    assert head.avg_pool.output_size == (1, 1, 1)

    # forward takes an (rgb, pose) tuple and returns per-branch scores
    scores = head(
        (torch.rand(2, 2048, 8, 7, 7), torch.rand(2, 512, 32, 7, 7)))
    assert scores['rgb'].shape == torch.Size([2, 4])
    assert scores['pose'].shape == torch.Size([2, 4])
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmaction.models import SlowFastHead


def test_slowfast_head():
    """Test layer construction, attributes and forward function in slowfast
    head.

    Fix over the original: the head was rebuilt right before the forward
    pass, silently discarding the ``init_weights()`` call on the instance
    whose attributes were checked.  The already-built head is now forwarded.
    """
    sf_head = SlowFastHead(num_classes=4, in_channels=2304)
    sf_head.init_weights()

    # constructor defaults
    assert sf_head.num_classes == 4
    assert sf_head.dropout_ratio == 0.8
    assert sf_head.in_channels == 2304
    assert sf_head.init_std == 0.01

    assert isinstance(sf_head.dropout, nn.Dropout)
    assert sf_head.dropout.p == sf_head.dropout_ratio

    assert isinstance(sf_head.fc_cls, nn.Linear)
    assert sf_head.fc_cls.in_features == sf_head.in_channels
    assert sf_head.fc_cls.out_features == sf_head.num_classes

    assert isinstance(sf_head.avg_pool, nn.AdaptiveAvgPool3d)
    assert sf_head.avg_pool.output_size == (1, 1, 1)

    # slow pathway: (N, 2048, 32, 7, 7); fast pathway: (N, 256, 4, 7, 7)
    feat_slow = torch.rand((3, 2048, 32, 7, 7))
    feat_fast = torch.rand((3, 256, 4, 7, 7))

    cls_scores = sf_head((feat_slow, feat_fast))
    assert cls_scores.shape == torch.Size([3, 4])
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from mmaction.models import TimeSformerHead


def test_timesformer_head():
    """Check TimeSformerHead attributes and its forward output shape."""
    head = TimeSformerHead(num_classes=4, in_channels=64)
    head.init_weights()

    # constructor defaults
    assert head.num_classes == 4
    assert head.in_channels == 64
    assert head.init_std == 0.02

    # forward: (N, C) token features -> (N, num_classes) logits
    scores = head(torch.rand(2, 64))
    assert scores.shape == torch.Size([2, 4])
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
import torch.nn as nn

from mmaction.models import TRNHead


def test_trn_head():
    """Check TRNHead for both relation types and reject unknown ones."""
    from mmaction.models.heads.trn_head import (RelationModule,
                                                RelationModuleMultiScale)

    head = TRNHead(num_classes=4, in_channels=2048, relation_type='TRN')
    head.init_weights()

    # constructor defaults
    assert head.num_classes == 4
    assert head.dropout_ratio == 0.8
    assert head.in_channels == 2048
    assert head.init_std == 0.001
    assert head.spatial_type == 'avg'

    # single-scale relation module ends in a small classifier
    consensus = head.consensus
    assert isinstance(consensus, RelationModule)
    assert consensus.hidden_dim == 256
    assert isinstance(consensus.classifier[3], nn.Linear)
    assert consensus.classifier[3].out_features == head.num_classes

    assert isinstance(head.dropout, nn.Dropout)
    assert head.dropout.p == head.dropout_ratio
    assert isinstance(head.fc_cls, nn.Linear)
    assert head.fc_cls.in_features == head.in_channels
    # fc_cls projects into the relation module's hidden space
    assert head.fc_cls.out_features == head.hidden_dim

    assert isinstance(head.avg_pool, nn.AdaptiveAvgPool2d)
    assert head.avg_pool.output_size == 1

    feat = torch.rand(8, 2048, 7, 7)
    num_segs = feat.shape[0]

    # forward with the single-scale relation
    assert head(feat, num_segs).shape == torch.Size([1, 4])

    # forward with the multi-scale relation
    head = TRNHead(
        num_classes=4,
        in_channels=2048,
        num_segments=8,
        relation_type='TRNMultiScale')
    head.init_weights()
    assert isinstance(head.consensus, RelationModuleMultiScale)
    assert head.consensus.scales == range(8, 1, -1)
    assert head(feat, num_segs).shape == torch.Size([1, 4])

    # unknown relation types are rejected at construction time
    with pytest.raises(ValueError):
        TRNHead(
            num_classes=4,
            in_channels=2048,
            num_segments=8,
            relation_type='RelationModlue')
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from mmaction.models import TSNHead


def test_tsn_head():
    """Test loss method, layer construction, attributes and forward function in
    tsn head.

    Fix over the original: the multi-class inference section forwarded
    through the single-class ``tsn_head``, so the multi-class head was
    never actually exercised.  It now forwards ``multi_tsn_head``.
    """
    tsn_head = TSNHead(num_classes=4, in_channels=2048)
    tsn_head.init_weights()

    # constructor defaults
    assert tsn_head.num_classes == 4
    assert tsn_head.dropout_ratio == 0.4
    assert tsn_head.in_channels == 2048
    assert tsn_head.init_std == 0.01
    assert tsn_head.consensus.dim == 1
    assert tsn_head.spatial_type == 'avg'

    assert isinstance(tsn_head.dropout, nn.Dropout)
    assert tsn_head.dropout.p == tsn_head.dropout_ratio

    assert isinstance(tsn_head.fc_cls, nn.Linear)
    assert tsn_head.fc_cls.in_features == tsn_head.in_channels
    assert tsn_head.fc_cls.out_features == tsn_head.num_classes

    assert isinstance(tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert tsn_head.avg_pool.output_size == (1, 1)

    # (num_segs * N, C, H, W) segment features
    input_shape = (8, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # tsn head inference
    num_segs = input_shape[0]
    cls_scores = tsn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])

    # Test multi-class recognition
    multi_tsn_head = TSNHead(
        num_classes=4,
        in_channels=2048,
        loss_cls=dict(type='BCELossWithLogits', loss_weight=160.0),
        multi_class=True,
        label_smooth_eps=0.01)
    multi_tsn_head.init_weights()
    assert multi_tsn_head.num_classes == 4
    assert multi_tsn_head.dropout_ratio == 0.4
    assert multi_tsn_head.in_channels == 2048
    assert multi_tsn_head.init_std == 0.01
    assert multi_tsn_head.consensus.dim == 1

    assert isinstance(multi_tsn_head.dropout, nn.Dropout)
    assert multi_tsn_head.dropout.p == multi_tsn_head.dropout_ratio

    assert isinstance(multi_tsn_head.fc_cls, nn.Linear)
    assert multi_tsn_head.fc_cls.in_features == multi_tsn_head.in_channels
    assert multi_tsn_head.fc_cls.out_features == multi_tsn_head.num_classes

    assert isinstance(multi_tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert multi_tsn_head.avg_pool.output_size == (1, 1)

    # multi-class tsn head inference (BUGFIX: was forwarding tsn_head)
    cls_scores = multi_tsn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])
# Copyright (c) OpenMMLab. All rights reserved.
import platform

import numpy as np
import pytest
import torch
from mmcv.transforms import to_tensor
from mmengine.structures import InstanceData

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_localizer_cfg
from mmaction.utils import register_all_modules

register_all_modules()


def get_localization_data_sample():
    """Build a localization sample with two gt temporal boxes and meta info."""
    gt_bbox = np.array([[0.1, 0.3], [0.375, 0.625]])
    data_sample = ActionDataSample()
    instance_data = InstanceData()
    instance_data['gt_bbox'] = to_tensor(gt_bbox)
    data_sample.gt_instances = instance_data
    data_sample.set_metainfo(
        dict(
            video_name='v_test',
            duration_second=100,
            duration_frame=960,
            feature_frame=960))
    return data_sample


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_bmn_loss():
    """BMN returns a loss dict in ``loss`` mode.

    Fix over the original: the CUDA path was guarded by ``if 0 and
    torch.cuda.is_available()`` — a debugging leftover that could never
    execute.  The dead branch is removed; the test always runs on CPU.
    """
    model_cfg = get_localizer_cfg(
        'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py')

    raw_feature = [torch.rand(400, 100)]
    data_samples = [get_localization_data_sample()]
    localizer_bmn = MODELS.build(model_cfg.model)
    losses = localizer_bmn(raw_feature, data_samples, mode='loss')
    assert isinstance(losses, dict)


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_bmn_predict():
    """BMN runs end-to-end in ``predict`` mode without gradients."""
    model_cfg = get_localizer_cfg(
        'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py')

    # dead ``if 0 and torch.cuda.is_available()`` branch removed (see above)
    localizer_bmn = MODELS.build(model_cfg.model)
    data_samples = [get_localization_data_sample()]
    with torch.no_grad():
        one_raw_feature = [torch.rand(400, 100)]
        localizer_bmn(one_raw_feature, data_samples, mode='predict')


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_bmn_tensor():
    """BMN runs in raw ``tensor`` mode with no data samples."""
    model_cfg = get_localizer_cfg(
        'bmn/bmn_2xb8-400x100-9e_activitynet-feature.py')

    # dead ``if 0 and torch.cuda.is_available()`` branch removed (see above)
    localizer_bmn = MODELS.build(model_cfg.model)
    with torch.no_grad():
        one_raw_feature = [torch.rand(400, 100)]
        localizer_bmn(one_raw_feature, data_samples=None, mode='tensor')
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp

import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal, assert_array_equal

from mmaction.models.localizers.utils import (generate_bsp_feature,
                                              generate_candidate_proposals,
                                              soft_nms, temporal_iop,
                                              temporal_iou)


def test_temporal_iou():
    """temporal_iou: IoU of each anchor interval against a single box."""
    anchors_min = np.array([0.0, 0.5])
    anchors_max = np.array([1.0, 1.5])
    box_min = 0.5
    box_max = 1.0

    # both anchors overlap [0.5, 1.0] by half their union
    iou = temporal_iou(anchors_min, anchors_max, box_min, box_max)
    assert_array_equal(iou, np.array([0.5, 0.5]))


def test_temporal_iop():
    """temporal_iop: intersection over the proposal's own length."""
    anchors_min = np.array([0.0, 0.5])
    anchors_max = np.array([1.0, 1.5])
    box_min = 0.4
    box_max = 1.1

    ioa = temporal_iop(anchors_min, anchors_max, box_min, box_max)
    assert_array_almost_equal(ioa, np.array([0.6, 0.6]))


def test_soft_nms():
    """soft_nms keeps the top-scoring proposals after score decay.

    Proposal rows are (tmin, tmax, tmin_score, tmax_score, score, score).
    """
    proposals = np.array([[0., 1., 1., 1., 0.5, 0.5],
                          [0., 0.4, 1., 1., 0.4, 0.4],
                          [0., 0.95, 1., 1., 0.6, 0.6]])
    proposal_list = soft_nms(proposals, 0.75, 0.65, 0.9, 1)
    assert_array_equal(proposal_list, [[0., 0.95, 0.6], [0., 0.4, 0.4]])


def test_generate_candidate_proposals():
    """Proposal generation from on-disk TEM results for two test videos."""
    video_list = [0, 1]
    video_infos = [
        dict(
            video_name='v_test1',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [30.0, 60.0],
                'label': 'Rock climbing'
            }],
            feature_frame=900),
        dict(
            video_name='v_test2',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [6.0, 8.0],
                'label': 'Drinking beer'
            }],
            feature_frame=900)
    ]
    # fixture TEM results checked into tests/data
    tem_results_dir = osp.normpath(
        osp.join(osp.dirname(__file__), '../../data/tem_results'))
    # test when tem_result_ext is not valid
    with pytest.raises(NotImplementedError):
        result_dict = generate_candidate_proposals(
            video_list,
            video_infos,
            tem_results_dir,
            5,
            0.5,
            tem_results_ext='unsupport_ext')
    # test without result_dict
    # expected rows: (tmin, tmax, tmin_score, tmax_score, score, iou, iop)
    assert_result1 = np.array([
        [0.1, 0.7, 0.58390868, 0.35708317, 0.20850396, 0.55555556, 0.55555556],
        [0.1, 0.5, 0.58390868, 0.32605207, 0.19038463, 0.29411765, 0.41666667],
        [0.1, 0.3, 0.58390868, 0.26221931, 0.15311213, 0., 0.],
        [0.3, 0.7, 0.30626667, 0.35708317, 0.10936267, 0.83333333, 0.83333333],
        [0.3, 0.5, 0.30626667, 0.32605207, 0.09985888, 0.45454545, 0.83333333]
    ])
    assert_result2 = np.array(
        [[0.1, 0.3, 0.78390867, 0.3622193, 0.28394685, 0., 0.],
         [0.1, 0.7, 0.78390867, 0.35708317, 0.27992059, 0., 0.],
         [0.1, 0.5, 0.78390867, 0.32605207, 0.25559504, 0., 0.]])
    result_dict = generate_candidate_proposals(video_list, video_infos,
                                               tem_results_dir, 5, 0.5)

    assert_array_almost_equal(result_dict['v_test1'], assert_result1)
    assert_array_almost_equal(result_dict['v_test2'], assert_result2)

    # test with result_dict (caller-owned dict is filled in place)
    result_dict = {}
    generate_candidate_proposals(
        video_list,
        video_infos,
        tem_results_dir,
        5,
        0.5,
        result_dict=result_dict)

    assert_array_almost_equal(result_dict['v_test1'], assert_result1)
    assert_array_almost_equal(result_dict['v_test2'], assert_result2)


def test_generate_bsp_feature():
    """BSP feature extraction from fixture TEM results and proposals."""
    video_list = [0, 1]
    video_infos = [
        dict(
            video_name='v_test1',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [30.0, 60.0],
                'label': 'Rock climbing'
            }],
            feature_frame=900),
        dict(
            video_name='v_test2',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [6.0, 8.0],
                'label': 'Drinking beer'
            }],
            feature_frame=900)
    ]
    tem_results_dir = osp.normpath(
        osp.join(osp.dirname(__file__), '../../data/tem_results'))
    pgm_proposals_dir = osp.normpath(
        osp.join(osp.dirname(__file__), '../../data/proposals'))

    # test when extension is not valid
    with pytest.raises(NotImplementedError):
        result_dict = generate_bsp_feature(
            video_list,
            video_infos,
            tem_results_dir,
            pgm_proposals_dir,
            tem_results_ext='unsupport_ext')

    with pytest.raises(NotImplementedError):
        result_dict = generate_bsp_feature(
            video_list,
            video_infos,
            tem_results_dir,
            pgm_proposals_dir,
            pgm_proposal_ext='unsupport_ext')

    # test without result_dict
    result_dict = generate_bsp_feature(
        video_list, video_infos, tem_results_dir, pgm_proposals_dir, top_k=2)
    # expected 32-dim BSP features for the top-2 proposals of each video
    assert_result1 = np.array(
        [[
            0.02633105, 0.02489364, 0.02345622, 0.0220188, 0.02058138,
            0.01914396, 0.01770654, 0.01626912, 0.01541432, 0.01514214,
            0.01486995, 0.01459776, 0.01432558, 0.01405339, 0.01378121,
            0.01350902, 0.03064331, 0.02941124, 0.02817916, 0.02694709,
            0.02571502, 0.02448295, 0.02325087, 0.0220188, 0.01432558,
            0.01409228, 0.01385897, 0.01362567, 0.01339237, 0.01315907,
            0.01292577, 0.01269246
        ],
         [
             0.01350902, 0.01323684, 0.01296465, 0.01269246, 0.01242028,
             0.01214809, 0.01187591, 0.01160372, 0.01154264, 0.01169266,
             0.01184269, 0.01199271, 0.01214273, 0.01229275, 0.01244278,
             0.0125928, 0.01432558, 0.01409228, 0.01385897, 0.01362567,
             0.01339237, 0.01315907, 0.01292577, 0.01269246, 0.01214273,
             0.01227132, 0.01239991, 0.0125285, 0.0126571, 0.01278569,
             0.01291428, 0.01304287
         ]])
    assert_result2 = np.array(
        [[
            0.04133105, 0.03922697, 0.03712288, 0.0350188, 0.03291471,
            0.03081063, 0.02870654, 0.02660246, 0.02541432, 0.02514214,
            0.02486995, 0.02459776, 0.02432558, 0.02405339, 0.02378121,
            0.02350902, 0.04764331, 0.04583981, 0.04403631, 0.04223281,
            0.0404293, 0.0386258, 0.0368223, 0.0350188, 0.02432558, 0.02409228,
            0.02385897, 0.02362567, 0.02339237, 0.02315907, 0.02292577,
            0.02269246
        ],
         [
             0.02350902, 0.02323684, 0.02296465, 0.02269246, 0.02242028,
             0.02214809, 0.02187591, 0.02160372, 0.02120931, 0.02069266,
             0.02017602, 0.01965937, 0.01914273, 0.01862609, 0.01810944,
             0.0175928, 0.02432558, 0.02409228, 0.02385897, 0.02362567,
             0.02339237, 0.02315907, 0.02292577, 0.02269246, 0.01914273,
             0.01869989, 0.01825706, 0.01781422, 0.01737138, 0.01692854,
             0.0164857, 0.01604287
         ]])
    assert_array_almost_equal(result_dict['v_test1'], assert_result1)
    assert_array_almost_equal(result_dict['v_test2'], assert_result2)

    # test with result_dict (caller-owned dict is filled in place)
    result_dict = {}
    generate_bsp_feature(
        video_list,
        video_infos,
        tem_results_dir,
        pgm_proposals_dir,
        top_k=2,
        result_dict=result_dict)
    assert_array_almost_equal(result_dict['v_test1'], assert_result1)
    assert_array_almost_equal(result_dict['v_test2'], assert_result2)
# --- tests/models/localizers/test_pem.py ---
# Copyright (c) OpenMMLab. All rights reserved.
# NOTE(review): the original file carried this copyright header twice; the
# duplicate line has been dropped.
import platform

import pytest
import torch
from mmengine.structures import InstanceData

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_localizer_cfg
from mmaction.utils import register_all_modules

register_all_modules()


def get_localization_data_sample():
    """Return an ``ActionDataSample`` carrying random PEM ground truth.

    The sample holds a ``(100, 32)`` ``bsp_feature`` tensor and a length-100
    ``reference_temporal_iou`` vector inside ``gt_instances``.
    """
    bsp_feature = torch.rand(100, 32)
    reference_temporal_iou = torch.rand(100)
    data_sample = ActionDataSample()
    instance_data = InstanceData()
    instance_data['bsp_feature'] = bsp_feature
    instance_data['reference_temporal_iou'] = reference_temporal_iou
    data_sample.gt_instances = instance_data
    return data_sample


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_pem():
    """Smoke-test the BSN PEM localizer in loss/predict/tensor modes."""
    model_cfg = get_localizer_cfg(
        'bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py')

    localizer_pem = MODELS.build(model_cfg.model)
    raw_features = [torch.rand(100, 32)] * 8
    data_samples = [get_localization_data_sample()] * 8
    losses = localizer_pem(raw_features, data_samples, mode='loss')
    assert isinstance(losses, dict)

    # Test forward predict
    tmin = torch.rand(100)
    tmax = torch.rand(100)
    tmin_score = torch.rand(100)
    tmax_score = torch.rand(100)

    video_meta = dict(
        video_name='v_test',
        duration_second=100,
        duration_frame=1000,
        annotations=[{
            'segment': [0.3, 0.6],
            'label': 'Rock climbing'
        }],
        feature_frame=900)

    with torch.no_grad():
        raw_feature = [torch.rand(100, 32)]
        data_sample = get_localization_data_sample()
        data_sample.set_metainfo(video_meta)
        gt_instances = data_sample.gt_instances
        gt_instances['tmin'] = tmin
        gt_instances['tmax'] = tmax
        gt_instances['tmin_score'] = tmin_score
        gt_instances['tmax_score'] = tmax_score
        data_samples = [data_sample]

        localizer_pem(raw_feature, data_samples, mode='predict')

    # Test forward tensor
    with torch.no_grad():
        raw_feature = [torch.rand(100, 32)]
        localizer_pem(raw_feature, data_samples=None, mode='tensor')


# --- tests/models/localizers/test_tem.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from mmcv.transforms import to_tensor


# NOTE(review): in the original patch this is a separate file that defines its
# own helper with the same name as the one in test_pem.py above.
def get_localization_data_sample():  # noqa: F811 (separate source file)
    """Return an ``ActionDataSample`` with a TEM ``gt_bbox`` and video meta."""
    gt_bbox = np.array([[0.1, 0.3], [0.375, 0.625]])
    data_sample = ActionDataSample()
    instance_data = InstanceData()
    instance_data['gt_bbox'] = to_tensor(gt_bbox)
    data_sample.gt_instances = instance_data
    data_sample.set_metainfo(
        dict(
            video_name='v_test',
            duration_second=100,
            duration_frame=960,
            feature_frame=960))
    return data_sample


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tem():
    """Smoke-test the BSN TEM localizer in loss and predict modes."""
    model_cfg = get_localizer_cfg(
        'bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py')

    localizer_tem = MODELS.build(model_cfg.model)
    raw_feature = torch.rand(8, 400, 100)
    data_samples = [get_localization_data_sample()] * 8
    losses = localizer_tem(raw_feature, data_samples, mode='loss')
    assert isinstance(losses, dict)

    # Test forward predict, one clip at a time.
    with torch.no_grad():
        for one_raw_feature in raw_feature:
            one_raw_feature = one_raw_feature.reshape(1, 400, 100)
            data_samples = [get_localization_data_sample()]
            localizer_tem(one_raw_feature, data_samples, mode='predict')


# --- tests/models/losses/test_binary_logistic_regression_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
from numpy.testing import assert_array_almost_equal

from mmaction.models import BinaryLogisticRegressionLoss, BMNLoss


def test_binary_logistic_regression_loss():
    """BMN's tem_loss must equal the sum of two binary logistic losses."""
    bmn_loss = BMNLoss()

    # test tem_loss
    pred_start = torch.tensor([0.9, 0.1])
    pred_end = torch.tensor([0.1, 0.9])
    gt_start = torch.tensor([1., 0.])
    gt_end = torch.tensor([0., 1.])
    output_tem_loss = bmn_loss.tem_loss(pred_start, pred_end, gt_start, gt_end)
    binary_logistic_regression_loss = BinaryLogisticRegressionLoss()
    assert_loss = (
        binary_logistic_regression_loss(pred_start, gt_start) +
        binary_logistic_regression_loss(pred_end, gt_end))
    assert_array_almost_equal(
        output_tem_loss.numpy(), assert_loss.numpy(), decimal=4)


# --- tests/models/losses/test_bmn_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from numpy.testing import assert_array_almost_equal

from mmaction.models import BMNLoss


def test_bmn_loss():
    """Check BMN sub-losses and that the combined loss is their weighted sum."""
    bmn_loss = BMNLoss()

    # test tem_loss
    pred_start = torch.tensor([0.9, 0.1])
    pred_end = torch.tensor([0.1, 0.9])
    gt_start = torch.tensor([1., 0.])
    gt_end = torch.tensor([0., 1.])
    output_tem_loss = bmn_loss.tem_loss(pred_start, pred_end, gt_start, gt_end)

    # test pem_reg_loss; seeded because pem_reg_loss samples internally
    seed = 1
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    pred_bm_reg = torch.tensor([[0.1, 0.99], [0.5, 0.4]])
    gt_iou_map = torch.tensor([[0, 1.], [0, 1.]])
    mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]])
    output_pem_reg_loss = bmn_loss.pem_reg_loss(pred_bm_reg, gt_iou_map, mask)
    assert_array_almost_equal(
        output_pem_reg_loss.numpy(), np.array([0.2140]), decimal=4)

    # test pem_cls_loss
    pred_bm_cls = torch.tensor([[0.1, 0.99], [0.95, 0.2]])
    gt_iou_map = torch.tensor([[0., 1.], [0., 1.]])
    mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]])
    output_pem_cls_loss = bmn_loss.pem_cls_loss(pred_bm_cls, gt_iou_map, mask)
    assert_array_almost_equal(
        output_pem_cls_loss.numpy(), np.array([1.6137]), decimal=4)

    # test bmn_loss: total = tem + 10 * pem_reg + pem_cls
    pred_bm = torch.tensor([[[[0.1, 0.99], [0.5, 0.4]],
                             [[0.1, 0.99], [0.95, 0.2]]]])
    pred_start = torch.tensor([[0.9, 0.1]])
    pred_end = torch.tensor([[0.1, 0.9]])
    gt_iou_map = torch.tensor([[[0., 2.5], [0., 10.]]])
    gt_start = torch.tensor([[1., 0.]])
    gt_end = torch.tensor([[0., 1.]])
    mask = torch.tensor([[0.1, 0.4], [0.4, 0.1]])
    output_loss = bmn_loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
                           gt_end, mask)
    assert_array_almost_equal(
        output_loss[0].numpy(),
        output_tem_loss + 10 * output_pem_reg_loss + output_pem_cls_loss)
    assert_array_almost_equal(output_loss[1].numpy(), output_tem_loss)
    assert_array_almost_equal(output_loss[2].numpy(), output_pem_reg_loss)
    assert_array_almost_equal(output_loss[3].numpy(), output_pem_cls_loss)


# --- tests/models/losses/test_cross_entropy_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn.functional as F
from numpy.testing import assert_almost_equal

from mmaction.models import BCELossWithLogits, CrossEntropyLoss


def test_bce_loss_with_logits():
    """BCELossWithLogits must match ``F.binary_cross_entropy_with_logits``."""
    cls_scores = torch.rand((3, 4))
    gt_labels = torch.rand((3, 4))

    bce_loss_with_logits = BCELossWithLogits()
    output_loss = bce_loss_with_logits(cls_scores, gt_labels)
    assert torch.equal(
        output_loss, F.binary_cross_entropy_with_logits(cls_scores, gt_labels))

    weight = torch.rand(4)
    class_weight = weight.numpy().tolist()
    bce_loss_with_logits = BCELossWithLogits(class_weight=class_weight)
    output_loss = bce_loss_with_logits(cls_scores, gt_labels)
    assert torch.equal(
        output_loss,
        F.binary_cross_entropy_with_logits(
            cls_scores, gt_labels, weight=weight))


def test_cross_entropy_loss():
    """CrossEntropyLoss must match ``F.cross_entropy`` for hard labels and
    agree (to 4 decimals) for equivalent one-hot soft labels."""
    cls_scores = torch.rand((3, 4))
    hard_gt_labels = torch.LongTensor([0, 1, 2]).squeeze()
    # one-hot encoding of hard_gt_labels, so both paths share an expectation
    soft_gt_labels = torch.FloatTensor([[1, 0, 0, 0], [0, 1, 0, 0],
                                        [0, 0, 1, 0]]).squeeze()

    # hard label without weight
    cross_entropy_loss = CrossEntropyLoss()
    output_loss = cross_entropy_loss(cls_scores, hard_gt_labels)
    assert torch.equal(output_loss,
                       F.cross_entropy(cls_scores, hard_gt_labels))

    # hard label with class weight
    weight = torch.rand(4)
    class_weight = weight.numpy().tolist()
    cross_entropy_loss = CrossEntropyLoss(class_weight=class_weight)
    output_loss = cross_entropy_loss(cls_scores, hard_gt_labels)
    assert torch.equal(
        output_loss,
        F.cross_entropy(cls_scores, hard_gt_labels, weight=weight))

    # soft label without class weight
    cross_entropy_loss = CrossEntropyLoss()
    output_loss = cross_entropy_loss(cls_scores, soft_gt_labels)
    assert_almost_equal(
        output_loss.numpy(),
        F.cross_entropy(cls_scores, hard_gt_labels).numpy(),
        decimal=4)

    # soft label with class weight
    cross_entropy_loss = CrossEntropyLoss(class_weight=class_weight)
    output_loss = cross_entropy_loss(cls_scores, soft_gt_labels)
    assert_almost_equal(
        output_loss.numpy(),
        F.cross_entropy(cls_scores, hard_gt_labels, weight=weight).numpy(),
        decimal=4)


# --- tests/models/losses/test_hvu_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
from mmaction.models import HVULoss


def test_hvu_loss():
    """Cover HVULoss over loss_type x with_mask x reduction combinations."""
    pred = torch.tensor([[-1.0525, -0.7085, 0.1819, -0.8011],
                         [0.1555, -1.5550, 0.5586, 1.9746]])
    gt = torch.tensor([[1., 0., 0., 0.], [0., 0., 1., 1.]])
    mask = torch.tensor([[1., 1., 0., 0.], [0., 0., 1., 1.]])
    category_mask = torch.tensor([[1., 0.], [0., 1.]])
    categories = ['action', 'scene']
    category_nums = (2, 2)
    category_loss_weights = (1, 1)

    # loss_type='all', no mask, reduction='sum'
    loss_all_nomask_sum = HVULoss(
        categories=categories,
        category_nums=category_nums,
        category_loss_weights=category_loss_weights,
        loss_type='all',
        with_mask=False,
        reduction='sum')
    loss = loss_all_nomask_sum(pred, gt, mask, category_mask)
    loss1 = F.binary_cross_entropy_with_logits(pred, gt, reduction='none')
    loss1 = torch.sum(loss1, dim=1)
    assert torch.eq(loss['loss_cls'], torch.mean(loss1))

    # loss_type='all', with mask
    loss_all_mask = HVULoss(
        categories=categories,
        category_nums=category_nums,
        category_loss_weights=category_loss_weights,
        loss_type='all',
        with_mask=True)
    loss = loss_all_mask(pred, gt, mask, category_mask)
    loss1 = F.binary_cross_entropy_with_logits(pred, gt, reduction='none')
    loss1 = torch.sum(loss1 * mask, dim=1) / torch.sum(mask, dim=1)
    loss1 = torch.mean(loss1)
    assert torch.eq(loss['loss_cls'], loss1)

    # loss_type='individual', with mask
    loss_ind_mask = HVULoss(
        categories=categories,
        category_nums=category_nums,
        category_loss_weights=category_loss_weights,
        loss_type='individual',
        with_mask=True)
    loss = loss_ind_mask(pred, gt, mask, category_mask)
    action_loss = F.binary_cross_entropy_with_logits(pred[:1, :2], gt[:1, :2])
    scene_loss = F.binary_cross_entropy_with_logits(pred[1:, 2:], gt[1:, 2:])
    loss1 = (action_loss + scene_loss) / 2
    assert torch.eq(loss['loss_cls'], loss1)

    # loss_type='individual', no mask, reduction='sum'
    loss_ind_nomask_sum = HVULoss(
        categories=categories,
        category_nums=category_nums,
        category_loss_weights=category_loss_weights,
        loss_type='individual',
        with_mask=False,
        reduction='sum')
    loss = loss_ind_nomask_sum(pred, gt, mask, category_mask)
    action_loss = F.binary_cross_entropy_with_logits(
        pred[:, :2], gt[:, :2], reduction='none')
    action_loss = torch.sum(action_loss, dim=1)
    action_loss = torch.mean(action_loss)

    scene_loss = F.binary_cross_entropy_with_logits(
        pred[:, 2:], gt[:, 2:], reduction='none')
    scene_loss = torch.sum(scene_loss, dim=1)
    scene_loss = torch.mean(scene_loss)

    loss1 = (action_loss + scene_loss) / 2
    assert torch.eq(loss['loss_cls'], loss1)


# --- tests/models/losses/test_ohem_hinge_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from numpy.testing import assert_array_almost_equal

from mmaction.models import OHEMHingeLoss


def test_ohem_hinge_loss():
    """Check OHEM hinge loss value, its gradient, and its input validation."""
    # test normal case
    pred = torch.tensor(
        [[
            0.5161, 0.5228, 0.7748, 0.0573, 0.1113, 0.8862, 0.1752, 0.9448,
            0.0253, 0.1009, 0.4371, 0.2232, 0.0412, 0.3487, 0.3350, 0.9294,
            0.7122, 0.3072, 0.2942, 0.7679
        ]],
        requires_grad=True)
    gt = torch.tensor([8])
    num_video = 1
    loss = OHEMHingeLoss.apply(pred, gt, 1, 1.0, num_video)
    assert_array_almost_equal(
        loss.detach().numpy(), np.array([0.0552]), decimal=4)
    # torch.autograd.Variable has been a deprecated no-op wrapper since
    # PyTorch 0.4; pass the gradient tensor directly.
    loss.backward(torch.ones([1]))
    # only the selected (gt-indexed, 1-based) logit receives gradient
    assert_array_almost_equal(
        np.array(pred.grad),
        np.array([[
            0., 0., 0., 0., 0., 0., 0., -1., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0.
        ]]),
        decimal=4)

    # test error case: gt length must match num_video
    with pytest.raises(ValueError):
        gt = torch.tensor([8, 10])
        loss = OHEMHingeLoss.apply(pred, gt, 1, 1.0, num_video)


# --- tests/models/losses/test_ssn_loss.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn.functional as F
from mmengine import ConfigDict

from mmaction.models import SSNLoss


def test_ssn_loss():
    """Check each SSN sub-loss against a hand-computed reference, then the
    combined loss dict with the configured loss weights."""
    ssn_loss = SSNLoss()

    # test activity_loss
    activity_score = torch.rand((8, 21))
    labels = torch.LongTensor([8] * 8).squeeze()
    activity_indexer = torch.tensor([0, 7])
    output_activity_loss = ssn_loss.activity_loss(activity_score, labels,
                                                  activity_indexer)
    assert torch.equal(
        output_activity_loss,
        F.cross_entropy(activity_score[activity_indexer, :],
                        labels[activity_indexer]))

    # test completeness_loss
    completeness_score = torch.rand((8, 20), requires_grad=True)
    labels = torch.LongTensor([8] * 8).squeeze()
    completeness_indexer = torch.tensor([0, 1, 2, 3, 4, 5, 6])
    positive_per_video = 1
    incomplete_per_video = 6
    output_completeness_loss = ssn_loss.completeness_loss(
        completeness_score, labels, completeness_indexer, positive_per_video,
        incomplete_per_video)

    pred = completeness_score[completeness_indexer, :]
    gt = labels[completeness_indexer]
    pred_dim = pred.size(1)
    pred = pred.view(-1, positive_per_video + incomplete_per_video, pred_dim)
    gt = gt.view(-1, positive_per_video + incomplete_per_video)
    # yapf:disable
    positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim)  # noqa:E501
    incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim)  # noqa:E501
    # yapf:enable
    ohem_ratio = 0.17
    positive_loss = OHEMHingeLoss.apply(
        positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1,
        1.0, positive_per_video)
    incomplete_loss = OHEMHingeLoss.apply(
        incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1), -1,
        ohem_ratio, incomplete_per_video)
    num_positives = positive_pred.size(0)
    num_incompletes = int(incomplete_pred.size(0) * ohem_ratio)
    assert_loss = ((positive_loss + incomplete_loss) /
                   float(num_positives + num_incompletes))
    assert torch.equal(output_completeness_loss, assert_loss)

    # test reg_loss
    bbox_pred = torch.rand((8, 20, 2))
    labels = torch.LongTensor([8] * 8).squeeze()
    bbox_targets = torch.rand((8, 2))
    regression_indexer = torch.tensor([0])
    output_reg_loss = ssn_loss.classwise_regression_loss(
        bbox_pred, labels, bbox_targets, regression_indexer)

    pred = bbox_pred[regression_indexer, :, :]
    gt = labels[regression_indexer]
    reg_target = bbox_targets[regression_indexer, :]
    class_idx = gt.data - 1  # labels are 1-based w.r.t. the class axis
    classwise_pred = pred[:, class_idx, :]
    classwise_reg_pred = torch.cat(
        (torch.diag(classwise_pred[:, :, 0]).view(-1, 1),
         torch.diag(classwise_pred[:, :, 1]).view(-1, 1)),
        dim=1)
    assert torch.equal(
        output_reg_loss,
        F.smooth_l1_loss(classwise_reg_pred.view(-1), reg_target.view(-1)) * 2)

    # test ssn_loss: combined dict with configured weights
    proposal_type = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 2]])
    train_cfg = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(
                    num_per_video=8,
                    positive_ratio=1,
                    background_ratio=1,
                    incomplete_ratio=6,
                    add_gt_as_proposals=True),
                loss_weight=dict(comp_loss_weight=0.1, reg_loss_weight=0.1))))
    output_loss = ssn_loss(activity_score, completeness_score, bbox_pred,
                           proposal_type, labels, bbox_targets, train_cfg)
    assert torch.equal(output_loss['loss_activity'], output_activity_loss)
    assert torch.equal(output_loss['loss_completeness'],
                       output_completeness_loss * 0.1)
    assert torch.equal(output_loss['loss_reg'], output_reg_loss * 0.1)


# --- tests/models/necks/__init__.py ---
# Copyright (c) OpenMMLab. All rights reserved.
# --- tests/models/necks/test_tpn.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import pytest
import torch

from mmaction.models import TPN
from mmaction.structures import ActionDataSample
from mmaction.testing import generate_backbone_demo_inputs


def get_label(label_):
    """Wrap each raw label in an ``ActionDataSample`` with ``gt_label`` set."""
    # The original enumerated `label_` only to re-index it; direct iteration
    # is equivalent and drops the unused loop variable.
    samples = []
    for one_label in label_:
        data_sample = ActionDataSample()
        data_sample.set_gt_label(one_label)
        samples.append(data_sample)
    return samples


def test_tpn():
    """Test TPN backbone."""

    tpn_cfg = dict(
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=400, loss_weight=0.5))

    # in_channels must be a tuple, not a list
    with pytest.raises(AssertionError):
        tpn_cfg_ = copy.deepcopy(tpn_cfg)
        tpn_cfg_['in_channels'] = list(tpn_cfg_['in_channels'])
        TPN(**tpn_cfg_)

    # out_channels must be an int
    with pytest.raises(AssertionError):
        tpn_cfg_ = copy.deepcopy(tpn_cfg)
        tpn_cfg_['out_channels'] = float(tpn_cfg_['out_channels'])
        TPN(**tpn_cfg_)

    # invalid downsample_position is rejected
    with pytest.raises(AssertionError):
        tpn_cfg_ = copy.deepcopy(tpn_cfg)
        tpn_cfg_['downsample_cfg']['downsample_position'] = 'unsupport'
        TPN(**tpn_cfg_)

    # every *_cfg argument must be a dict (or None), never another type
    for k in tpn_cfg:
        if not k.endswith('_cfg'):
            continue
        tpn_cfg_ = copy.deepcopy(tpn_cfg)
        tpn_cfg_[k] = list()
        with pytest.raises(AssertionError):
            TPN(**tpn_cfg_)

    # invalid flow_type is rejected
    with pytest.raises(ValueError):
        tpn_cfg_ = copy.deepcopy(tpn_cfg)
        tpn_cfg_['flow_type'] = 'unsupport'
        TPN(**tpn_cfg_)

    target_shape = (32, 1)
    target_ = generate_backbone_demo_inputs(target_shape).long().squeeze()

    x0_shape = (32, 1024, 1, 4, 4)
    x1_shape = (32, 2048, 1, 2, 2)
    x0 = generate_backbone_demo_inputs(x0_shape)
    x1 = generate_backbone_demo_inputs(x1_shape)
    x = [x0, x1]

    # ResNetTPN with 'cascade' flow_type
    tpn_cfg_ = copy.deepcopy(tpn_cfg)
    tpn_cascade = TPN(**tpn_cfg_)
    target = get_label(target_)
    feat, loss_aux = tpn_cascade(x, target)
    assert feat.shape == torch.Size([32, 2048, 1, 2, 2])
    assert len(loss_aux) == 1

    # ResNetTPN with 'parallel' flow_type
    tpn_cfg_ = copy.deepcopy(tpn_cfg)
    tpn_parallel = TPN(flow_type='parallel', **tpn_cfg_)
    target = get_label(target_)
    feat, loss_aux = tpn_parallel(x, target)
    assert feat.shape == torch.Size([32, 2048, 1, 2, 2])
    assert len(loss_aux) == 1

    # ResNetTPN with 'cascade' flow_type and target is None
    feat, loss_aux = tpn_cascade(x, None)
    assert feat.shape == torch.Size([32, 2048, 1, 2, 2])
    assert len(loss_aux) == 0

    # ResNetTPN with 'parallel' flow_type and target is None
    feat, loss_aux = tpn_parallel(x, None)
    assert feat.shape == torch.Size([32, 2048, 1, 2, 2])
    assert len(loss_aux) == 0


# --- tests/models/recognizers/__init__.py ---
# Copyright (c) OpenMMLab. All rights reserved.

# --- tests/models/recognizers/recognizer_omni.py ---
# Copyright (c) OpenMMLab. All rights reserved.
# NOTE(review): this file is named recognizer_omni.py, without the `test_`
# prefix — default pytest discovery will NOT collect test_omni_resnet.
# Presumably it should be test_recognizer_omni.py; confirm against the
# project's pytest configuration.
from unittest.mock import MagicMock

import torch

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules


def test_omni_resnet():
    """Smoke-test the OmniSource recognizer on mixed video/image batches."""
    register_all_modules()
    config = get_recognizer_cfg(
        'omnisource/slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb.py')
    recognizer = MODELS.build(config.model)

    # test train_step

    video_sample = {
        'inputs': [
            torch.randint(0, 255, (1, 3, 8, 224, 224)),
            torch.randint(0, 255, (1, 3, 8, 224, 224))
        ],
        'data_samples': [
            ActionDataSample().set_gt_label(2),
            ActionDataSample().set_gt_label(2)
        ]
    }

    image_sample = {
        'inputs': [
            torch.randint(0, 255, (1, 3, 224, 224)),
            torch.randint(0, 255, (1, 3, 224, 224))
        ],
        'data_samples': [
            ActionDataSample().set_gt_label(2),
            ActionDataSample().set_gt_label(2)
        ]
    }

    # one loss entry per data source, regardless of ordering
    optim_wrapper = MagicMock()
    loss_vars = recognizer.train_step([video_sample, image_sample],
                                      optim_wrapper)
    assert 'loss_cls_0' in loss_vars
    assert 'loss_cls_1' in loss_vars

    loss_vars = recognizer.train_step([image_sample, video_sample],
                                      optim_wrapper)
    assert 'loss_cls_0' in loss_vars
    assert 'loss_cls_1' in loss_vars

    # test test_step
    with torch.no_grad():
        predictions = recognizer.test_step(video_sample)
        score = predictions[0].pred_score
        assert len(predictions) == 2
        assert torch.min(score) >= 0
        assert torch.max(score) <= 1


# --- tests/models/recognizers/test_recognizer2d.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import platform
from unittest.mock import MagicMock

import pytest
import torch
from mmengine.utils import digit_version

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules


def train_test_step(cfg, input_shape):
    """Run train_step and test_step for a 2D recognizer config.

    ``input_shape`` is (batch, views, C, H, W); returns the train-step loss
    dict and the last test-step predictions.
    """
    recognizer = MODELS.build(cfg.model)
    num_classes = cfg.model.cls_head.num_classes
    batch_size = input_shape[0]
    input_shape = input_shape[1:]
    data_batch = {
        'inputs':
        [torch.randint(0, 256, input_shape) for _ in range(batch_size)],
        'data_samples':
        [ActionDataSample().set_gt_label(2) for _ in range(batch_size)]
    }

    # test train_step
    optim_wrapper = MagicMock()
    loss_vars = recognizer.train_step(data_batch, optim_wrapper)
    assert 'loss' in loss_vars
    assert 'loss_cls' in loss_vars
    optim_wrapper.update_params.assert_called_once()

    # test test_step
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == batch_size
        assert score.shape == torch.Size([num_classes])
        assert torch.min(score) >= 0
        assert torch.max(score) <= 1

    # test twice sample + 3 crops
    num_views = input_shape[0] * 2 * 3
    input_shape = (num_views, *input_shape[1:])
    data_batch['inputs'] = [torch.randint(0, 256, input_shape)]
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == batch_size
        assert score.shape == torch.Size([num_classes])

    return loss_vars, predictions


def test_tsn():
    register_all_modules()
    config = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)


def test_tsn_mmcls_backbone():
    register_all_modules()
    config = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None
    # test mmcls backbone, both as a registry string and as a class object
    mmcls_backbone = dict(
        type='mmcls.ResNeXt',
        depth=101,
        num_stages=4,
        out_indices=(3, ),
        groups=32,
        width_per_group=4,
        style='pytorch')
    config.model['backbone'] = mmcls_backbone

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)

    from mmcls.models import ResNeXt
    mmcls_backbone['type'] = ResNeXt
    config.model['backbone'] = mmcls_backbone

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)


def test_tsn_mobileone():
    register_all_modules()
    config = get_recognizer_cfg(
        'tsn/custom_backbones/tsn_imagenet-pretrained-mobileone-s4_8xb32-1x1x8-100e_kinetics400-rgb.py'  # noqa: E501
    )
    config.model['backbone']['init_cfg'] = None
    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)


def test_tsn_timm_backbone():
    # test tsn from timm
    register_all_modules()
    config = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    timm_backbone = dict(type='timm.efficientnet_b0', pretrained=False)
    config.model['backbone'] = timm_backbone
    config.model['cls_head']['in_channels'] = 1280

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)

    # timm changed swin's feature layout after 0.6.7
    import timm
    if digit_version(timm.__version__) <= digit_version('0.6.7'):
        feature_shape = 'NLC'
    else:
        feature_shape = 'NHWC'

    timm_swin = dict(
        type='timm.swin_base_patch4_window7_224',
        pretrained=False,
        feature_shape=feature_shape)
    config.model['backbone'] = timm_swin
    config.model['cls_head']['in_channels'] = 1024

    input_shape = (1, 3, 3, 224, 224)
    train_test_step(config, input_shape)


def test_tsn_tv_backbone():
    register_all_modules()
    config = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None
    # test tv backbone
    # NOTE(review): pretrained=True downloads torchvision weights at test
    # time, making this test network-dependent — consider pretrained=False.
    tv_backbone = dict(type='torchvision.densenet161', pretrained=True)
    config.model['backbone'] = tv_backbone
    config.model['cls_head']['in_channels'] = 2208

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)

    from torchvision.models import densenet161
    tv_backbone = dict(type=densenet161, pretrained=True)
    config.model['backbone'] = tv_backbone
    config.model['cls_head']['in_channels'] = 2208

    input_shape = (1, 3, 3, 32, 32)
    train_test_step(config, input_shape)


def test_tsm():
    register_all_modules()
    # test tsm-mobilenetv2
    config = get_recognizer_cfg(
        'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py'  # noqa: E501
    )
    config.model['backbone']['pretrained'] = None
    config.model['backbone']['pretrained2d'] = None

    input_shape = (1, 8, 3, 32, 32)
    train_test_step(config, input_shape)

    # test tsm-res50
    config = get_recognizer_cfg(
        'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None
    config.model['backbone']['pretrained2d'] = None

    input_shape = (1, 8, 3, 32, 32)
    train_test_step(config, input_shape)

    # test tsm-mobileone
    config = get_recognizer_cfg(
        'tsm/tsm_imagenet-pretrained-mobileone-s4_8xb16-1x1x16-50e_kinetics400-rgb.py'  # noqa: E501
    )
    config.model['backbone']['init_cfg'] = None
    config.model['backbone']['pretrained2d'] = None

    input_shape = (1, 16, 3, 32, 32)
    train_test_step(config, input_shape)


def test_trn():
    register_all_modules()
    config = get_recognizer_cfg(
        'trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py')
    config.model['backbone']['pretrained'] = None

    input_shape = (1, 8, 3, 32, 32)
    train_test_step(config, input_shape)


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tpn():
    register_all_modules()
    config = get_recognizer_cfg(
        'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py')
    config.model['backbone']['pretrained'] = None

    input_shape = (1, 8, 3, 64, 64)
    train_test_step(config, input_shape)


def test_tanet():
    register_all_modules()
    config = get_recognizer_cfg('tanet/tanet_imagenet-pretrained-r50_8xb8-'
                                'dense-1x1x8-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None

    input_shape = (1, 8, 3, 32, 32)
    train_test_step(config, input_shape)


# --- tests/models/recognizers/test_recognizer3d.py ---
# Copyright (c) OpenMMLab. All rights reserved.


# NOTE(review): in the original patch this is a separate file that defines
# its own helper with the same name as the one in test_recognizer2d.py above.
def train_test_step(cfg, input_shape):  # noqa: F811 (separate source file)
    """Run train_step and test_step for a 3D recognizer config.

    ``input_shape`` is (M, C, T, H, W); returns the train-step loss dict and
    the last test-step predictions.
    """
    recognizer = MODELS.build(cfg.model)
    num_classes = cfg.model.cls_head.num_classes
    data_batch = {
        'inputs': [torch.randint(0, 256, input_shape)],
        'data_samples': [ActionDataSample().set_gt_label(2)]
    }

    # test train_step
    optim_wrapper = MagicMock()
    loss_vars = recognizer.train_step(data_batch, optim_wrapper)
    assert 'loss' in loss_vars
    assert 'loss_cls' in loss_vars
    optim_wrapper.update_params.assert_called_once()

    # test test_step
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == 1
        assert score.shape == torch.Size([num_classes])
        assert torch.min(score) >= 0
        assert torch.max(score) <= 1

    # test when average_clips is None: per-clip scores are returned unreduced
    recognizer.cls_head.average_clips = None
    num_views = 3
    input_shape = (num_views, *input_shape[1:])
    data_batch['inputs'] = [torch.randint(0, 256, input_shape)]
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == 1
        assert score.shape == torch.Size([num_views, num_classes])

    return loss_vars, predictions


def test_i3d():
    register_all_modules()
    config = get_recognizer_cfg(
        'i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 8, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_r2plus1d():
    register_all_modules()
    config = get_recognizer_cfg(
        'r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    config.model['backbone']['norm_cfg'] = dict(type='BN3d')
    input_shape = (1, 3, 8, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_slowfast():
    register_all_modules()
    config = get_recognizer_cfg(
        'slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py')
    input_shape = (1, 3, 16, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_csn():
    register_all_modules()
    config = get_recognizer_cfg(
        'csn/ircsn_ig65m-pretrained-r152_8xb12-32x2x1-58e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 8, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_timesformer():
    register_all_modules()
    config = get_recognizer_cfg(
        'timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None
    config.model['backbone']['img_size'] = 32
    input_shape = (1, 3, 8, 32, 32)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_c3d():
    register_all_modules()
    config = get_recognizer_cfg(
        'c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py')
    config.model['backbone']['pretrained'] = None
    config.model['backbone']['out_dim'] = 512
    input_shape = (1, 3, 16, 28, 28)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_slowonly():
    register_all_modules()
    config = get_recognizer_cfg(
        'slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 4, 32, 32)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_tpn_slowonly():
    register_all_modules()
    config = get_recognizer_cfg('tpn/tpn-slowonly_imagenet-pretrained-r50_'
                                '8xb8-8x8x1-150e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 4, 48, 48)  # M C T H W
    loss_vars, _ = train_test_step(config, input_shape=input_shape)
    # TPN adds an auxiliary head whose loss must be part of the total
    assert 'loss_aux' in loss_vars
    assert loss_vars['loss_cls'] + loss_vars['loss_aux'] == loss_vars['loss']


def test_swin():
    register_all_modules()
    config = get_recognizer_cfg('swin/swin-tiny-p244-w877_in1k-pre_'
                                '8xb8-amp-32x2x1-30e_kinetics400-rgb.py')
    config.model['backbone']['pretrained2d'] = False
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 4, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


def test_c2d():
    register_all_modules()
    config = get_recognizer_cfg(
        'c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py')
    config.model['backbone']['pretrained'] = None
    input_shape = (1, 3, 8, 64, 64)  # M C T H W
    train_test_step(config, input_shape=input_shape)


# --- tests/models/recognizers/test_recognizer_gcn.py ---
# --- tests/models/recognizers/test_recognizer_gcn.py ---
# Copyright (c) OpenMMLab. All rights reserved.
from unittest.mock import MagicMock

import torch

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_skeletongcn_cfg
from mmaction.utils import register_all_modules


def train_test_step(cfg, input_shape):
    """Shared train/test smoke check for skeleton-GCN recognizers."""
    recognizer = MODELS.build(cfg.model)
    num_classes = cfg.model.cls_head.num_classes
    data_batch = {
        'inputs': [torch.randn(input_shape)],
        'data_samples': [ActionDataSample().set_gt_label(2)]
    }

    # Training: both loss entries present, exactly one optimizer update.
    optim_wrapper = MagicMock()
    loss_vars = recognizer.train_step(data_batch, optim_wrapper)
    assert 'loss' in loss_vars
    assert 'loss_cls' in loss_vars
    optim_wrapper.update_params.assert_called_once()

    # Inference with clip averaging: one probability vector in [0, 1].
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == 1
        assert score.shape == torch.Size([num_classes])
        assert torch.min(score) >= 0
        assert torch.max(score) <= 1

    # Inference without clip averaging: one score row per clip.
    recognizer.cls_head.average_clips = None
    num_clips = 3
    data_batch['inputs'] = [torch.randn((num_clips, *input_shape[1:]))]
    with torch.no_grad():
        predictions = recognizer.test_step(data_batch)
        score = predictions[0].pred_score
        assert len(predictions) == 1
        assert score.shape == torch.Size([num_clips, num_classes])

    return loss_vars, predictions


def test_stgcn():
    register_all_modules()
    cfg = get_skeletongcn_cfg(
        'stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py')
    shape = (1, 2, 30, 17, 3)  # N M T V C
    train_test_step(cfg, shape)


def test_agcn():
    register_all_modules()
    cfg = get_skeletongcn_cfg(
        '2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py')
    shape = (1, 2, 30, 17, 3)  # N M T V C
    train_test_step(cfg, shape)


def test_stgcn_plusplus():
    register_all_modules()
    cfg = get_skeletongcn_cfg(
        'stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py')
    shape = (1, 2, 30, 17, 3)  # N M T V C
    train_test_step(cfg, shape)


# --- tests/models/roi_heads/test_bbox_heads.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmaction.models import BBoxHeadAVA


def test_bbox_head_ava():
    """Construction, forward shape and top-k helper of BBoxHeadAVA.

    The loss/get_det_bboxes checks below are kept as a commented-out
    reference block, unchanged from the original.
    """
    # topk must be None, an int, or a tuple of ints.
    with pytest.raises(TypeError):
        BBoxHeadAVA(background_class=True, topk=0.1)

    # topk entries must be smaller than num_classes.
    with pytest.raises(AssertionError):
        BBoxHeadAVA(background_class=True, num_classes=5, topk=(3, 5))

    bbox_head = BBoxHeadAVA(
        background_class=True, in_channels=10, num_classes=4, topk=1)
    feat = torch.randn([3, 10, 2, 2, 2])
    ret = bbox_head(feat)
    assert ret.shape == (3, 4)

    cls_score = torch.tensor(
        [[0.568, -0.162, 0.273, -0.390, 0.447, 0.102, -0.409],
         [2.388, 0.609, 0.369, 1.630, -0.808, -0.212, 0.296],
         [0.252, -0.533, -0.644, -0.591, 0.148, 0.963, -0.525],
         [0.134, -0.311, -0.764, -0.752, 0.656, -1.517, 0.185]])

    # topk_to_matrix must mark exactly the k largest scores per row
    # (background column 0 excluded).
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 1),
        torch.tensor([[0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0],
                      [0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 2),
        torch.tensor([[0, 1, 0, 1, 0, 0], [1, 0, 1, 0, 0, 0],
                      [0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 0, 1]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 3),
        torch.tensor([[0, 1, 0, 1, 1, 0], [1, 1, 1, 0, 0, 0],
                      [0, 0, 0, 1, 1, 1], [1, 0, 0, 1, 0, 1]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 6),
        torch.ones([4, 6], dtype=bool))

    # Construction with default pooling and with explicit max/avg pooling
    # must both initialise cleanly.
    bbox_head = BBoxHeadAVA(background_class=True)
    bbox_head.init_weights()
    bbox_head = BBoxHeadAVA(
        background_class=True,
        temporal_pool_type='max',
        spatial_pool_type='avg')
    bbox_head.init_weights()

    # test without background class
    """
    losses = bbox_head.loss(
        cls_score=cls_score,
        bbox_pred=None,
        rois=None,
        labels=labels,
        label_weights=label_weights)
    assert torch.isclose(losses['loss_action_cls'], torch.tensor(0.7162495))
    assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.6666666))
    assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.4791665))
    assert torch.isclose(losses['recall@top3'], torch.tensor(0.75))
    assert torch.isclose(losses['prec@top3'], torch.tensor(0.5))
    assert torch.isclose(losses['recall@top5'], torch.tensor(1.0))
    assert torch.isclose(losses['prec@top5'], torch.tensor(0.45))

    # Test Single-Label Loss
    bbox_head = BBoxHeadAVA(multilabel=False)
    losses = bbox_head.loss(
        cls_score=cls_score,
        bbox_pred=None,
        rois=None,
        labels=labels,
        label_weights=label_weights)
    assert torch.isclose(losses['loss_action_cls'], torch.tensor(1.639561))
    assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.25))
    assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.25))
    assert torch.isclose(losses['recall@top3'], torch.tensor(0.75))
    assert torch.isclose(losses['prec@top3'], torch.tensor(0.5))
    assert torch.isclose(losses['recall@top5'], torch.tensor(1.0))
    assert torch.isclose(losses['prec@top5'], torch.tensor(0.45))

    # Test ROI
    rois = torch.tensor([[0.0, 0.1, 0.2, 0.3, 0.4], [0.0, 0.5, 0.6, 0.7, 0.8]])
    rois[1::2] *= 380
    rois[2::2] *= 220
    crop_quadruple = np.array([0.1, 0.2, 0.8, 0.7])
    cls_score = torch.tensor([0.995, 0.728])
    img_shape = (320, 480)
    flip = True

    bbox_head = BBoxHeadAVA(multilabel=True)
    bboxes, scores = bbox_head.get_det_bboxes(
        rois=rois,
        cls_score=cls_score,
        img_shape=img_shape,
        flip=flip,
        crop_quadruple=crop_quadruple)
    assert torch.all(
        torch.isclose(
            bboxes,
            torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500],
                          [0.45499998, 0.69875002, 0.58166665, 0.86499995]])))
    assert torch.all(
        torch.isclose(scores, torch.tensor([0.73007441, 0.67436624])))

    bbox_head = BBoxHeadAVA(multilabel=False)
    bboxes, scores = bbox_head.get_det_bboxes(
        rois=rois,
        cls_score=cls_score,
        img_shape=img_shape,
        flip=flip,
        crop_quadruple=crop_quadruple)
    assert torch.all(
        torch.isclose(
            bboxes,
            torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500],
                          [0.45499998, 0.69875002, 0.58166665, 0.86499995]])))
    assert torch.all(torch.isclose(scores, torch.tensor([0.56636, 0.43364])))
    """


# --- tests/models/roi_heads/test_fbo_head.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp

import torch

from mmaction.models import FBOHead


def test_fbo_head():
    """Layer construction and forward output shape for each FBO variant."""
    lfb_prefix_path = osp.normpath(
        osp.join(osp.dirname(__file__), '../../data/lfb'))

    def make_lfb_cfg():
        # A fresh dict per head, in case FBOHead mutates its config.
        # NOTE(review): dataset_modes=('unittest') is a plain string, not a
        # 1-tuple -- confirm the LFB implementation accepts a bare string.
        return dict(
            lfb_prefix_path=lfb_prefix_path,
            max_num_sampled_feat=5,
            window_size=60,
            lfb_channels=16,
            dataset_modes=('unittest'),
            device='cpu')

    st_feat = torch.rand((1, 16, 1, 8, 8))
    rois = torch.randn(1, 5)
    rois[0][0] = 0  # first column is the batch index of the RoI
    img_metas = [dict(img_key='video_1, 930')]

    # non-local FBO
    fbo_head = FBOHead(
        lfb_cfg=make_lfb_cfg(),
        fbo_cfg=dict(
            type='non_local',
            st_feat_channels=16,
            lt_feat_channels=16,
            latent_channels=8,
            num_st_feat=1,
            num_lt_feat=5 * 60,
        ))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 24, 1, 1, 1)

    # avg FBO
    fbo_head = FBOHead(lfb_cfg=make_lfb_cfg(), fbo_cfg=dict(type='avg'))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 32, 1, 1, 1)

    # max FBO
    fbo_head = FBOHead(lfb_cfg=make_lfb_cfg(), fbo_cfg=dict(type='max'))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 32, 1, 1, 1)


# --- tests/models/roi_heads/test_roi_extractors.py ---
# Copyright (c) OpenMMLab. All rights reserved.
"""import torch TODO! from mmaction.models import SingleRoIExtractor3D.

def test_single_roi_extractor3d():
    roi_extractor = SingleRoIExtractor3D(
        roi_layer_type='RoIAlign',
        featmap_stride=16,
        output_size=8,
        sampling_ratio=0,
        pool_mode='avg',
        aligned=True,
        with_temporal_pool=True)
    feat = torch.randn([4, 64, 8, 16, 16])
    rois = torch.tensor([[0., 1., 1., 6., 6.], [1., 2., 2., 7., 7.],
                         [3., 2., 2., 9., 9.], [2., 2., 0., 10., 9.]])
    roi_feat, feat = roi_extractor(feat, rois)
    assert roi_feat.shape == (4, 64, 1, 8, 8)
    assert feat.shape == (4, 64, 1, 16, 16)

    feat = (torch.randn([4, 64, 8, 16, 16]), torch.randn([4, 32, 16, 16, 16]))
    roi_feat, feat = roi_extractor(feat, rois)
    assert roi_feat.shape == (4, 96, 1, 8, 8)
    assert feat.shape == (4, 96, 1, 16, 16)

    feat = torch.randn([4, 64, 8, 16, 16])
    roi_extractor = SingleRoIExtractor3D(
        roi_layer_type='RoIAlign',
        featmap_stride=16,
        output_size=8,
        sampling_ratio=0,
        pool_mode='avg',
        aligned=True,
        with_temporal_pool=False)
    roi_feat, feat = roi_extractor(feat, rois)
    assert roi_feat.shape == (4, 64, 8, 8, 8)
    assert feat.shape == (4, 64, 8, 16, 16)

    feat = (torch.randn([4, 64, 8, 16, 16]), torch.randn([4, 32, 16, 16, 16]))
    roi_feat, feat = roi_extractor(feat, rois)
    assert roi_feat.shape == (4, 96, 16, 8, 8)
    assert feat.shape == (4, 96, 16, 16, 16)

    feat = torch.randn([4, 64, 8, 16, 16])
    roi_extractor = SingleRoIExtractor3D(
        roi_layer_type='RoIAlign',
        featmap_stride=16,
        output_size=8,
        sampling_ratio=0,
        pool_mode='avg',
        aligned=True,
        with_temporal_pool=True,
        with_global=True)
    roi_feat, feat = roi_extractor(feat, rois)
    assert roi_feat.shape == (4, 128, 1, 8, 8)
    assert feat.shape == (4, 64, 1, 16, 16)
"""

# --- tests/models/roi_heads/test_shared_heads.py ---
# Copyright (c)
# OpenMMLab. All rights reserved.
import torch

from mmaction.models import ACRNHead


def test_acrn_head():
    """Forward output shapes of ACRNHead for several stride/conv settings."""
    roi_feat = torch.randn(4, 16, 1, 7, 7)
    feat = torch.randn(2, 16, 1, 16, 16)
    rois = torch.Tensor([[0, 2.2268, 0.5926, 10.6142, 8.0029],
                         [0, 2.2577, 0.1519, 11.6451, 8.9282],
                         [1, 1.9874, 1.0000, 11.1585, 8.2840],
                         [1, 3.3338, 3.7166, 8.4174, 11.2785]])

    head = ACRNHead(32, 16)
    head.init_weights()
    new_feat = head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 16, 16)

    # A stride of 2 halves the spatial resolution of the output.
    head = ACRNHead(32, 16, stride=2)
    new_feat = head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 8, 8)

    head = ACRNHead(32, 16, stride=2, num_convs=2)
    new_feat = head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 8, 8)


# --- tests/models/similarity/test_adapters.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch

from mmaction.models import SimpleMeanAdapter, TransformerAdapter


def test_transformer_adapter():
    """TransformerAdapter forward pass plus a segment-mismatch failure."""
    # Feeding more segments than the adapter was built for must fail.
    with pytest.raises(RuntimeError):
        mismatched = TransformerAdapter(
            num_segs=8,
            transformer_width=64,
            transformer_heads=8,
            transformer_layers=2)
        mismatched(torch.randn(2, 9, 64))

    adapter = TransformerAdapter(
        num_segs=8,
        transformer_width=64,
        transformer_heads=8,
        transformer_layers=2)
    adapter.init_weights()
    adapted = adapter(torch.randn(2, 8, 64))
    assert adapted.shape == torch.Size([2, 64])


def test_simple_mean_adapter():
    """SimpleMeanAdapter averages over the configured dimension(s)."""
    adapter = SimpleMeanAdapter(dim=1)
    adapted = adapter(torch.randn(2, 8, 64))
    assert adapted.shape == torch.Size([2, 64])

    adapter = SimpleMeanAdapter(dim=(1, 2))
    adapted = adapter(torch.randn(2, 8, 2, 64))
    assert adapted.shape == torch.Size([2, 64])


# --- tests/models/similarity/test_clip_similarity.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import platform
from unittest.mock import MagicMock

import pytest
import torch

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_similarity_cfg
from mmaction.utils import register_all_modules


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_clip_similarity():
    """CLIP4Clip similarity: train/test steps and the layer-freezing policy."""
    register_all_modules()
    cfg = get_similarity_cfg(
        'clip4clip/'
        'clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py')
    cfg.model.frozen_layers = -1  # start with no frozen layers
    model = MODELS.build(cfg.model)
    model.train()

    data_batch = {
        'inputs': {
            'imgs': [torch.randint(0, 256, (2, 3, 224, 224))],
            'text': [torch.randint(0, 49408, (77, ))]
        },
        'data_samples': [ActionDataSample()]
    }

    # One training step: both similarity losses present, one optimizer update.
    optim_wrapper = MagicMock()
    loss_vars = model.train_step(data_batch, optim_wrapper)
    assert 'loss' in loss_vars
    assert 'sim_loss_v2t' in loss_vars
    assert 'sim_loss_t2v' in loss_vars
    optim_wrapper.update_params.assert_called_once()

    # Inference yields a single 512-d feature for each modality.
    with torch.no_grad():
        predictions = model.test_step(data_batch)
    features = predictions[0].features
    assert len(predictions) == 1
    assert features.video_feature.size() == (512, )
    assert features.text_feature.size() == (512, )

    def check_frozen_layers(mdl, frozen_layers):
        """Assert requires_grad of every CLIP parameter matches the policy."""
        if frozen_layers < 0:
            # Nothing may be frozen.
            assert all([p.requires_grad for p in mdl.clip.parameters()])
            return

        top_layers = [
            'ln_final', 'text_projection', 'logit_scale', 'visual.ln_post',
            'visual.proj'
        ]
        mid_layers = [
            'visual.transformer.resblocks', 'transformer.resblocks'
        ]
        for name, param in mdl.clip.named_parameters():
            if any(name.find(n) == 0 for n in top_layers):
                # Heads above the transformer stacks always stay trainable.
                assert param.requires_grad is True
            elif any(name.find(n) == 0 for n in mid_layers):
                # Transformer blocks below `frozen_layers` are frozen.
                layer_n = int(name.split('.resblocks.')[1].split('.')[0])
                if layer_n >= frozen_layers:
                    assert param.requires_grad is True
                else:
                    assert param.requires_grad is False
            else:
                assert param.requires_grad is False

    check_frozen_layers(model, -1)

    # NOTE: model.train() is expected to re-apply the freezing policy after
    # `frozen_layers` changes -- that is exactly what this loop verifies.
    for num_frozen in (0, 6, 12):
        model.frozen_layers = num_frozen
        model.train()
        check_frozen_layers(model, num_frozen)


# --- tests/models/utils/__init__.py ---
# Copyright (c) OpenMMLab. All rights reserved.

# --- tests/models/utils/test_blending_utils.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
import torch.nn.functional as F
from mmcv.transforms import to_tensor

from mmaction.models import CutmixBlending, MixupBlending, RandomBatchAugment
from mmaction.structures import ActionDataSample


def get_label(label_):
    """Wrap each raw label into an ActionDataSample and return the list."""
    samples = []
    for one_label in label_:
        sample = ActionDataSample()
        sample.set_gt_label(one_label)
        samples.append(sample)
    return samples


def test_mixup():
    """MixupBlending keeps batch shape and sample count for all layouts."""
    alpha = 0.2
    num_classes = 10
    label = get_label([to_tensor(x) for x in range(4)])
    mixup = MixupBlending(num_classes, alpha)

    # NCHW imgs
    imgs = torch.randn(4, 4, 3, 32, 32)
    mixed_imgs, mixed_label = mixup(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32))
    assert len(mixed_label) == 4

    # NCTHW imgs
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label([to_tensor(x) for x in range(4)])
    mixed_imgs, mixed_label = mixup(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4

    # multi-label: one-hot tensors as labels
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes))
    mixed_imgs, mixed_label = mixup(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4


def test_cutmix():
    """CutmixBlending keeps batch shape and sample count for all layouts."""
    alpha = 0.2
    num_classes = 10
    label = get_label([to_tensor(x) for x in range(4)])
    cutmix = CutmixBlending(num_classes, alpha)

    # NCHW imgs
    imgs = torch.randn(4, 4, 3, 32, 32)
    mixed_imgs, mixed_label = cutmix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32))
    assert len(mixed_label) == 4

    # NCTHW imgs
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label([to_tensor(x) for x in range(4)])
    mixed_imgs, mixed_label = cutmix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4

    # multi-label: one-hot tensors as labels
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes))
    mixed_imgs, mixed_label = cutmix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4


def test_rand_blend():
    """RandomBatchAugment: prob validation, residual prob, and blending."""
    alpha_mixup = 0.2
    alpha_cutmix = 0.2
    num_classes = 10
    label = get_label([to_tensor(x) for x in range(4)])
    blending_augs = [
        dict(type='MixupBlending', alpha=alpha_mixup, num_classes=num_classes),
        dict(
            type='CutmixBlending', alpha=alpha_cutmix, num_classes=num_classes)
    ]

    # Probabilities summing above 1 must be rejected.
    with pytest.raises(AssertionError):
        rand_mix = RandomBatchAugment(blending_augs, [0.5, 0.6])

    # No probs given: uniform choice, probs stays None.
    rand_mix = RandomBatchAugment(blending_augs, probs=None)
    assert rand_mix.probs is None

    # Partial probs: the remainder is the chance of applying no blending.
    probs = [0.5, 0.4]
    rand_mix = RandomBatchAugment(blending_augs, probs)
    np.testing.assert_allclose(rand_mix.probs[-1], 0.1)

    # NCHW imgs
    imgs = torch.randn(4, 4, 3, 32, 32)
    mixed_imgs, mixed_label = rand_mix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32))
    assert len(mixed_label) == 4

    # NCTHW imgs
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label([to_tensor(x) for x in range(4)])
    mixed_imgs, mixed_label = rand_mix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4

    # multi-label: one-hot tensors as labels
    imgs = torch.randn(4, 4, 2, 3, 32, 32)
    label = get_label(F.one_hot(torch.arange(4), num_classes=num_classes))
    mixed_imgs, mixed_label = rand_mix(imgs, label)
    assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32))
    assert len(mixed_label) == 4


# --- tests/models/utils/test_gradcam.py ---
# Copyright (c) OpenMMLab. All rights reserved.
import platform

import pytest
import torch

from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules
from mmaction.utils.gradcam_utils import GradCAM

register_all_modules()


def _get_target_shapes(input_shape, num_classes=400, model_type='2D'):
    """Return the (blended_imgs, preds) shapes GradCAM should produce."""
    if model_type not in ['2D', '3D']:
        raise ValueError(f'Data type {model_type} is not available')

    preds_target_shape = (input_shape[0], num_classes)
    if model_type == '3D':
        # input  (batch, num_crops*num_clips, C, clip_len, H, W)
        # target (batch*num_crops*num_clips, clip_len, H, W, C)
        blended_imgs_target_shape = (input_shape[0] * input_shape[1],
                                     input_shape[3], input_shape[4],
                                     input_shape[5], input_shape[2])
    else:
        # input  (batch, num_segments, C, H, W)
        # target (batch, num_segments, H, W, C)
        blended_imgs_target_shape = (input_shape[0], input_shape[1],
                                     input_shape[3], input_shape[4],
                                     input_shape[2])

    return blended_imgs_target_shape, preds_target_shape


def _do_test_2D_models(recognizer,
                       target_layer_name,
                       input_shape,
                       num_classes=400,
                       device='cpu'):
    """Run GradCAM on a 2D recognizer and verify output shapes."""
    demo_data = {
        'inputs': [torch.randint(0, 256, input_shape[1:])],
        'data_samples': [ActionDataSample().set_gt_label(2)]
    }

    recognizer = recognizer.to(device)
    gradcam = GradCAM(recognizer, target_layer_name)

    blended_imgs_target_shape, preds_target_shape = _get_target_shapes(
        input_shape, num_classes=num_classes, model_type='2D')

    # Check both with and without the alternative flag.
    blended_imgs, preds = gradcam(demo_data)
    assert blended_imgs.size() == blended_imgs_target_shape
    assert preds.size() == preds_target_shape

    blended_imgs, preds = gradcam(demo_data, True)
    assert blended_imgs.size() == blended_imgs_target_shape
    assert preds.size() == preds_target_shape


def _do_test_3D_models(recognizer,
                       target_layer_name,
                       input_shape,
                       num_classes=400):
    """Run GradCAM on a 3D recognizer and verify output shapes."""
    blended_imgs_target_shape, preds_target_shape = _get_target_shapes(
        input_shape, num_classes=num_classes, model_type='3D')
    demo_data = {
        'inputs': [torch.randint(0, 256, input_shape[1:])],
        'data_samples': [ActionDataSample().set_gt_label(2)]
    }

    gradcam = GradCAM(recognizer, target_layer_name)

    blended_imgs, preds = gradcam(demo_data)
    assert blended_imgs.size() == blended_imgs_target_shape
    assert preds.size() == preds_target_shape

    blended_imgs, preds = gradcam(demo_data, True)
    assert blended_imgs.size() == blended_imgs_target_shape
    assert preds.size() == preds_target_shape


def test_tsn():
    cfg = get_recognizer_cfg(
        'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg

    _do_test_2D_models(recognizer, 'backbone/layer4/1/relu', (1, 25, 3, 32, 32))


def test_i3d():
    cfg = get_recognizer_cfg(
        'i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained2d'] = False
    cfg.model['backbone']['pretrained'] = None

    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg

    _do_test_3D_models(recognizer, 'backbone/layer4/1/relu',
                       (1, 1, 3, 32, 32, 32))


def test_r2plus1d():
    cfg = get_recognizer_cfg(
        'r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained2d'] = False
    cfg.model['backbone']['pretrained'] = None
    cfg.model['backbone']['norm_cfg'] = dict(type='BN3d')

    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg

    _do_test_3D_models(recognizer, 'backbone/layer4/1/relu',
                       (1, 3, 3, 8, 16, 16))


def test_slowfast():
    cfg = get_recognizer_cfg(
        'slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py')

    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg

    _do_test_3D_models(recognizer, 'backbone/slow_path/layer4/1/relu',
                       (1, 1, 3, 32, 32, 32))


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tsm():
    cfg = get_recognizer_cfg(
        'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    target_layer_name = 'backbone/layer4/1/relu'

    # base config: 8 segments
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_2D_models(recognizer, target_layer_name, (1, 8, 3, 32, 32))

    # twice sample + 3 crops: 2*3*8 = 48 views
    cfg.model.test_cfg = dict(average_clips='prob')
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_2D_models(recognizer, target_layer_name, (1, 48, 3, 32, 32))


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_csn():
    cfg = get_recognizer_cfg(
        'csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py'  # noqa: E501
    )
    cfg.model['backbone']['pretrained2d'] = False
    cfg.model['backbone']['pretrained'] = None

    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_3D_models(recognizer, 'backbone/layer4/1/relu',
                       (1, 1, 3, 32, 16, 16))


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tpn():
    target_layer_name = 'backbone/layer4/1/relu'

    # TPN on a TSM (2D) backbone; fcn_test disabled for GradCAM.
    cfg = get_recognizer_cfg(
        'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    cfg.model['backbone']['num_segments'] = 4
    cfg.model.test_cfg['fcn_test'] = False
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_2D_models(recognizer, target_layer_name, (1, 4, 3, 16, 16), 174)

    # TPN on a SlowOnly (3D) backbone.
    cfg = get_recognizer_cfg(
        'tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    cfg.model.test_cfg['fcn_test'] = False
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_3D_models(recognizer, target_layer_name, (1, 3, 3, 4, 16, 16))


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_c3d():
    cfg = get_recognizer_cfg(
        'c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_3D_models(recognizer, 'backbone/conv5a/activate',
                       (1, 1, 3, 16, 112, 112), 101)


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_tin():
    cfg = get_recognizer_cfg(
        'tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None

    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_2D_models(
        recognizer, 'backbone/layer4/1/relu', (1, 8, 3, 64, 64),
        device='cuda:0')


@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_x3d():
    cfg = get_recognizer_cfg('x3d/x3d_s_13x6x1_facebook-kinetics400-rgb.py')
    cfg.model['backbone']['pretrained'] = None
    recognizer = MODELS.build(cfg.model)
    recognizer.cfg = cfg
    _do_test_3D_models(recognizer, 'backbone/layer4/1/relu',
                       (1, 1, 3, 13, 16, 16))
a/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py b/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2c198d09639554490a00fea478e6bb4576bcc8 --- /dev/null +++ b/tests/structures/bbox/assigners/test_max_iou_assigner_ava.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""import os.path as osp. + +import torch + +from mmaction.datasets import AVADataset + + +def test_assigner_sampler(): + try: + from mmdet.core.bbox import build_assigner, build_sampler + except (ImportError, ModuleNotFoundError): + raise ImportError( + 'Failed to import `build_assigner` and `build_sampler` ' + 'from `mmdet.core.bbox`. The two APIs are required for ' + 'the testing in `test_bbox.py`! ') + + data_prefix = osp.normpath( + osp.join(osp.dirname(__file__), '../../../data/eval_detection')) + ann_file = osp.join(data_prefix, 'gt.csv') + label_file = osp.join(data_prefix, 'action_list.txt') + proposal_file = osp.join(data_prefix, 'proposal.pkl') + dataset = AVADataset( + ann_file=ann_file, + exclude_file=None, + pipeline=[], + label_file=label_file, + proposal_file=proposal_file, + num_classes=4) + + assigner = dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5) + assigner = build_assigner(assigner) + proposal = torch.tensor(dataset[0]['proposals']) + + gt_bboxes = torch.tensor(dataset[0]['gt_bboxes']) + gt_labels = torch.tensor(dataset[0]['gt_labels']) + assign_result = assigner.assign( + bboxes=proposal, + gt_bboxes=gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=gt_labels) + assert assign_result.num_gts == 4 + assert torch.all( + assign_result.gt_inds == torch.tensor([0, 0, 3, 3, 0, 0, 0, 1, 0, 0])) + assert torch.all( + torch.isclose( + assign_result.max_overlaps, + torch.tensor([ + 0.40386841, 0.47127257, 0.53544776, 0.58797631, 0.29281288, + 0.40979504, 0.45902917, 0.50093938, 0.21560125, 0.32948171 + ], + dtype=torch.float64))) + 
def test_bbox_target():
    """Check ``bbox_target`` merges pos/neg samples into labels + weights.

    Positive samples keep their one-hot gt labels with weight
    ``cfg.pos_weight``; negative samples get all-zero labels with weight 1.0.
    """
    # A plain attribute bag is all the config stub needs. The previous
    # ``abc.abstractproperty()`` trick is deprecated since Python 3.3 and
    # only worked by accident.
    from types import SimpleNamespace

    pos_bboxes = torch.tensor([[0.072, 0.47, 0.84, 0.898],
                               [0.23, 0.215, 0.781, 0.534],
                               [0.195, 0.128, 0.643, 0.944],
                               [0.236, 0.189, 0.689, 0.74]])
    neg_bboxes = torch.tensor([[0.375, 0.371, 0.726, 0.804],
                               [0.024, 0.398, 0.776, 0.719]])
    pos_gt_labels = torch.tensor([[0., 0., 1., 0.], [0., 0., 0., 1.],
                                  [0., 1., 0., 0.], [0., 1., 0., 0.]])
    cfg = SimpleNamespace(pos_weight=0.8)
    labels, label_weights = bbox_target([pos_bboxes], [neg_bboxes],
                                        [pos_gt_labels], cfg)
    assert torch.all(
        torch.isclose(
            labels,
            torch.tensor([[0., 0., 1., 0.], [0., 0., 0., 1.],
                          [0., 1., 0., 0.], [0., 1., 0., 0.],
                          [0., 0., 0., 0.], [0., 0., 0., 0.]])))
    assert torch.all(
        torch.isclose(label_weights, torch.tensor([0.8] * 4 + [1.0] * 2)))
def test_bbox2result():
    """``bbox2result`` groups detections into per-class (N, 5) arrays.

    Each returned entry stacks the boxes selected for that class with the
    class score appended as the fifth column.
    """
    det_boxes = torch.tensor([[0.072, 0.47, 0.84, 0.898],
                              [0.23, 0.215, 0.781, 0.534],
                              [0.195, 0.128, 0.643, 0.944],
                              [0.236, 0.189, 0.689, 0.74],
                              [0.375, 0.371, 0.726, 0.804],
                              [0.024, 0.398, 0.776, 0.719]])
    det_scores = torch.tensor([[-1.650, 0.515, 0.798, 1.240],
                               [1.368, -1.128, 0.037, -1.087],
                               [0.481, -1.303, 0.501, -0.463],
                               [-0.356, 0.126, -0.840, 0.438],
                               [0.079, 1.269, -0.263, -0.538],
                               [-0.853, 0.391, 0.103, 0.398]])
    num_classes = 4

    def _assert_per_class(result, expected):
        # Compare each per-class array against its expected value.
        for got, want in zip(result, expected):
            assert np.all(np.isclose(got, np.array(want)))

    # Test for multi-label
    _assert_per_class(
        bbox2result(det_boxes, det_scores, num_classes),
        [[[0.072, 0.47, 0.84, 0.898, 0.515],
          [0.236, 0.189, 0.689, 0.74, 0.126],
          [0.375, 0.371, 0.726, 0.804, 1.269],
          [0.024, 0.398, 0.776, 0.719, 0.391]],
         [[0.072, 0.47, 0.84, 0.898, 0.798],
          [0.23, 0.215, 0.781, 0.534, 0.037],
          [0.195, 0.128, 0.643, 0.944, 0.501],
          [0.024, 0.398, 0.776, 0.719, 0.103]],
         [[0.072, 0.47, 0.84, 0.898, 1.24],
          [0.236, 0.189, 0.689, 0.74, 0.438],
          [0.024, 0.398, 0.776, 0.719, 0.398]]])

    # Test for single-label
    _assert_per_class(
        bbox2result(det_boxes, det_scores, num_classes, -1.0),
        [[[0.375, 0.371, 0.726, 0.804, 1.269]],
         [[0.23, 0.215, 0.781, 0.534, 0.037],
          [0.195, 0.128, 0.643, 0.944, 0.501]],
         [[0.072, 0.47, 0.84, 0.898, 1.240],
          [0.236, 0.189, 0.689, 0.74, 0.438],
          [0.024, 0.398, 0.776, 0.719, 0.398]]])
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_frame_extract():
    """Check that ``frame_extract`` dumps one image per returned path."""
    data_root = osp.normpath(osp.join(osp.dirname(__file__), '../data'))
    video_path = osp.join(data_root, 'test.mp4')
    with TemporaryDirectory() as tmp_dir:
        # assign short_side
        paths, frames = frame_extract(
            video_path, short_side=100, out_dir=tmp_dir)
        assert osp.exists(tmp_dir)
        assert len(os.listdir(f'{tmp_dir}/test')) == len(paths)
        assert min(frames[0].shape[:2]) == 100
        # default short_side
        paths, frames = frame_extract(video_path, out_dir=tmp_dir)
        assert osp.exists(tmp_dir)
        assert len(os.listdir(f'{tmp_dir}/test')) == len(paths)
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_visualizer():
    """Smoke-test ``ActionVisualizer.add_datasample`` on a short clip."""
    frames = decord.VideoReader('./demo/demo.mp4').get_batch(
        range(32)).asnumpy()

    sample = ActionDataSample()
    sample.set_gt_label(2)

    visualizer = ActionVisualizer()
    # Call with and without a data sample, and with an explicit step.
    visualizer.add_datasample('demo', frames)
    visualizer.add_datasample('demo', frames, sample)
    visualizer.add_datasample('demo', frames, sample, step=1)
    return
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tensorboard_visbackend():
    """Check that ``ActionVisualizer`` emits a tensorboard event file."""
    frames = decord.VideoReader('./demo/demo.mp4').get_batch(
        range(32)).asnumpy()

    sample = ActionDataSample()
    sample.set_gt_label(2)
    with TemporaryDirectory() as tmp_dir:
        visualizer = ActionVisualizer(
            save_dir=tmp_dir,
            vis_backends=[dict(type='TensorboardVisBackend')])
        visualizer.add_datasample('demo', frames, sample, step=1)

        vis_dir = osp.join(tmp_dir, 'vis_data')
        assert Path(vis_dir).exists()
        found = any(
            entry.startswith('events.out.tfevents.')
            for entry in os.listdir(vis_dir))
        assert found, 'Cannot find tensorboard file!'
        # wait tensorboard store asynchronously
        time.sleep(1)
    return
def cal_train_time(log_dicts, args):
    """Print per-epoch iteration-time statistics for each json log."""
    for idx, log_dict in enumerate(log_dicts):
        print(f'{"-" * 5}Analyze train time of {args.json_logs[idx]}{"-" * 5}')
        # Optionally drop the first iter of every epoch (usually an outlier
        # caused by dataloader warm-up).
        if args.include_outliers:
            times = np.array([log_dict[ep]['time'] for ep in log_dict])
        else:
            times = np.array([log_dict[ep]['time'][1:] for ep in log_dict])
        per_epoch_mean = times.mean(-1)
        slowest = per_epoch_mean.argmax()
        fastest = per_epoch_mean.argmin()
        print(f'slowest epoch {slowest + 1}, '
              f'average time is {per_epoch_mean[slowest]:.4f}')
        print(f'fastest epoch {fastest + 1}, '
              f'average time is {per_epoch_mean[fastest]:.4f}')
        print(f'time std over epochs is {per_epoch_mean.std():.4f}')
        print(f'average iter time: {np.mean(times):.4f} s/iter')
        print()
def add_plot_parser(subparsers):
    """Register the ``plot_curve`` sub-command and its options."""
    plt_parser = subparsers.add_parser(
        'plot_curve', help='parser for plotting curves')
    plt_parser.add_argument(
        'json_logs',
        nargs='+',
        type=str,
        help='path of train log in json format')
    plt_parser.add_argument(
        '--keys',
        nargs='+',
        type=str,
        default=['top1_acc'],
        help='the metric that you want to plot')
    plt_parser.add_argument('--title', type=str, help='title of figure')
    plt_parser.add_argument(
        '--legend',
        nargs='+',
        type=str,
        default=None,
        help='legend of each plot')
    plt_parser.add_argument(
        '--backend', default=None, type=str, help='backend of plt')
    plt_parser.add_argument(
        '--style', default='dark', type=str, help='style of plt')
    plt_parser.add_argument('--out', default=None, type=str)


def add_time_parser(subparsers):
    """Register the ``cal_train_time`` sub-command and its options."""
    time_parser = subparsers.add_parser(
        'cal_train_time',
        help='parser for computing the average time per training iteration')
    time_parser.add_argument(
        'json_logs',
        nargs='+',
        type=str,
        help='path of train log in json format')
    time_parser.add_argument(
        '--include-outliers',
        action='store_true',
        help='include the first value of every epoch when computing '
        'the average time')


def parse_args():
    """Build the top-level CLI with one sub-command per analysis task."""
    parser = argparse.ArgumentParser(description='Analyze Json Log')
    # currently only support plot curve and calculate average train time
    subparsers = parser.add_subparsers(dest='task', help='task parser')
    add_plot_parser(subparsers)
    add_time_parser(subparsers)
    return parser.parse_args()
def load_json_logs(json_logs):
    """Convert json training logs into per-epoch metric dicts.

    Args:
        json_logs (list[str]): Paths of json log files (one json dict per
            line).

    Returns:
        list[dict]: One dict per input log. Keys are epoch numbers; each
        value maps a metric name (e.g. ``memory``, ``top1_acc``) to the
        list of values recorded over that epoch's iterations.
    """
    log_dicts = [dict() for _ in json_logs]
    for json_log, log_dict in zip(json_logs, log_dicts):
        with open(json_log, 'r') as log_file:
            for line in log_file:
                log = json.loads(line.strip())
                # skip lines without `epoch` field
                if 'epoch' not in log:
                    continue
                epoch = log.pop('epoch')
                if epoch not in log_dict:
                    log_dict[epoch] = defaultdict(list)
                for k, v in log.items():
                    log_dict[epoch][k].append(v)
    return log_dicts


def main():
    """Entry point: validate inputs and dispatch to the selected task."""
    args = parse_args()

    json_logs = args.json_logs
    for json_log in json_logs:
        # `assert` is stripped under `python -O`; validate explicitly.
        if not json_log.endswith('.json'):
            raise ValueError(f'{json_log} is not a json log file')

    log_dicts = load_json_logs(json_logs)

    # Explicit dispatch table instead of `eval` on user-controlled input.
    tasks = {'plot_curve': plot_curve, 'cal_train_time': cal_train_time}
    tasks[args.task](log_dicts, args)
def main():
    """Benchmark dataloading on a 256-sample slice of the training set."""
    cli = argparse.ArgumentParser(description='Benchmark dataloading')
    cli.add_argument('config', help='train config file path')
    args = cli.parse_args()
    cfg = Config.fromfile(args.config)

    # init logger before other steps
    logger = get_root_logger()
    logger.info(f'MMAction2 Version: {__version__}')
    logger.info(f'Config: {cfg.text}')

    # create bench data list
    bench_ann_file = 'benchlist.txt'
    if not os.path.exists(bench_ann_file):
        with open(cfg.ann_file_train) as src:
            head = src.readlines()[:256]
        with open(bench_ann_file, 'w') as dst:
            dst.writelines(head)
    cfg.data.train.ann_file = bench_ann_file

    dataset = build_dataset(cfg.data.train)
    data_loader = build_dataloader(
        dataset,
        videos_per_gpu=cfg.data.videos_per_gpu,
        workers_per_gpu=0,
        persistent_workers=False,
        num_gpus=1,
        dist=False)

    # Start progress bar after first 5 batches
    prog_bar = mmcv.ProgressBar(
        len(dataset) - 5 * cfg.data.videos_per_gpu, start=False)
    for batch_idx, batch in enumerate(data_loader):
        if batch_idx == 5:
            prog_bar.start()
        for _ in batch['imgs']:
            if batch_idx >= 5:
                prog_bar.update()
def parse_args(args=None):
    """Parse CLI arguments for the benchmark script.

    Args:
        args (list[str] | None): Argument list to parse; ``None`` falls back
            to ``sys.argv`` (the argparse default), keeping CLI behavior
            unchanged while making the function unit-testable.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    parser = argparse.ArgumentParser(
        description='MMAction2 benchmark a recognizer')
    parser.add_argument('config', help='test config file path')
    # `type=int` is required: without it a user-supplied value arrives as
    # `str` and `(i + 1) % args.log_interval` in main() raises TypeError.
    parser.add_argument(
        '--log-interval', type=int, default=10, help='interval of logging')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    return parser.parse_args(args)
def parse_args():
    """Parse CLI arguments for the video-checking tool.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(description='MMAction2 check datasets')
    parser.add_argument('config', help='test config file path')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    # Previously this flag was referenced below (`args.eval_options`) but
    # never registered, so supplying `--options` crashed with
    # AttributeError. Register it explicitly.
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    parser.add_argument(
        '--output-file',
        default='invalid-video.txt',
        help='Output file path which keeps corrupted/missing video file paths')
    parser.add_argument(
        '--split',
        default='train',
        choices=['train', 'val', 'test'],
        help='Dataset split')
    parser.add_argument(
        '--decoder',
        default='decord',
        choices=['decord', 'opencv', 'pyav'],
        help='Video decoder type, should be one of [decord, opencv, pyav]')
    parser.add_argument(
        '--nproc',
        type=int,
        default=(cpu_count() - 1 or 1),
        help='Number of processes to check videos')
    parser.add_argument(
        '--remove-corrupted-videos',
        action='store_true',
        help='Whether to delete all corrupted videos')
    args = parser.parse_args()

    if args.options and args.eval_options:
        # Raise instead of silently preferring one of the two.
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


@TRANSFORMS.register_module()
class RandomSampleFrames:
    """Pipeline transform that picks a handful of frame indices to verify."""

    def __call__(self, results):
        """Select frames to verify.

        Select the first, last and three random frames. Required key is
        "total_frames", added or modified key is "frame_inds".

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        assert results['total_frames'] > 0

        # first and last frames
        results['frame_inds'] = np.array([0, results['total_frames'] - 1])

        # choose 3 random frames
        if results['total_frames'] > 2:
            results['frame_inds'] = np.concatenate([
                results['frame_inds'],
                np.random.randint(1, results['total_frames'] - 1, 3)
            ])

        return results


def _do_check_videos(lock, pipeline, output_file, data_info):
    """Run ``pipeline`` on one video and record its path on any failure."""
    try:
        pipeline(data_info)
    except Exception:  # noqa
        # Catch Exception rather than a bare `except` so Ctrl-C can still
        # abort the multi-process check; the lock serializes appends from
        # concurrent workers.
        with lock:
            with open(output_file, 'a') as f:
                f.write(data_info['filename'] + '\n')
def parse_args():
    """Build and parse CLI arguments for confusion-matrix evaluation."""
    arg_parser = argparse.ArgumentParser(
        description='Eval a checkpoint and draw the confusion matrix.')
    arg_parser.add_argument('config', help='test config file path')
    arg_parser.add_argument(
        'ckpt_or_result',
        type=str,
        help='The checkpoint file (.pth) or '
        'dumpped predictions pickle file (.pkl).')
    arg_parser.add_argument(
        '--out', help='the file to save the confusion matrix.')
    arg_parser.add_argument(
        '--show',
        action='store_true',
        help='whether to display the metric result by matplotlib if supports.')
    arg_parser.add_argument(
        '--show-path', type=str, help='Path to save the visualization image.')
    arg_parser.add_argument(
        '--include-values',
        action='store_true',
        help='To draw the values in the figure.')
    arg_parser.add_argument(
        '--label-file', default=None, help='Labelmap file')
    arg_parser.add_argument(
        '--target-classes',
        nargs='+',
        type=int,
        default=[],
        help='Selected classes to evaluate, and remains will be neglected')
    arg_parser.add_argument(
        '--cmap',
        default='viridis',
        type=str,
        help='The color map to use. Defaults to "viridis".')
    arg_parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    return arg_parser.parse_args()
+ dataset = DATASETS.build({ + **cfg.test_dataloader.dataset, 'pipeline': [] + }) + classes = dataset.metainfo.get('classes') + except Exception: + classes = None + + if args.label_file is not None: + classes = list_from_file(args.label_file) + if classes is None: + num_classes = cm.shape[0] + classes = list(range(num_classes)) + + if args.target_classes: + assert len(args.target_classes) > 1, \ + 'please ensure select more than one class' + target_idx = torch.tensor(args.target_classes) + cm = cm[target_idx][:, target_idx] + classes = [classes[idx] for idx in target_idx] + + if args.out is not None: + dump(cm, args.out) + + if args.show or args.show_path is not None: + fig = ConfusionMatrix.plot( + cm, + show=args.show, + classes=classes, + include_values=args.include_values, + cmap=args.cmap) + if args.show_path is not None: + fig.savefig(args.show_path) + print(f'The confusion matrix is saved at {args.show_path}.') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/eval_metric.py b/tools/analysis_tools/eval_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..51b5156c37e557fd8d06dce67dddb7d8e30dc992 --- /dev/null +++ b/tools/analysis_tools/eval_metric.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import mmengine +from mmengine import Config, DictAction +from mmengine.evaluator import Evaluator +from mmengine.registry import init_default_scope +from rich import print + + +def parse_args(): + parser = argparse.ArgumentParser(description='Evaluate metric of the ' + 'results saved in pkl format') + parser.add_argument('config', help='Config of the model') + parser.add_argument('pkl_results', help='Results in pickle format') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
def main():
    """Offline-evaluate pickled predictions with the configured metrics."""
    args = parse_args()

    # load config
    config = Config.fromfile(args.config)
    if args.cfg_options is not None:
        config.merge_from_dict(args.cfg_options)
    init_default_scope(config.get('default_scope', 'mmaction'))

    predictions = mmengine.load(args.pkl_results)
    results = Evaluator(config.test_evaluator).offline_evaluate(predictions)
    print(results)
def parse_args(args=None):
    """Parse CLI arguments for the FLOPs counter.

    Args:
        args (list[str] | None): Argument list to parse; ``None`` falls back
            to ``sys.argv`` (the argparse default), keeping CLI behavior
            unchanged while making the function unit-testable.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Get model flops and params')
    parser.add_argument('config', help='config file path')
    parser.add_argument(
        '--shape',
        type=int,
        nargs='+',
        default=[224, 224],
        help='input image size')
    return parser.parse_args(args)
def parse_args():
    """Parse the config path and optional dict overrides."""
    cli = argparse.ArgumentParser(description='Print the whole config')
    cli.add_argument('config', help='config file path')
    cli.add_argument(
        '--options', nargs='+', action=DictAction, help='arguments in dict')
    return cli.parse_args()


def main():
    """Load the config, apply overrides and pretty-print it."""
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    print(f'Config:\n{cfg.pretty_text}')


if __name__ == '__main__':
    main()
+import argparse + +import numpy as np +from mmengine import load +from scipy.special import softmax + +from mmaction.evaluation.functional import (get_weighted_score, + mean_class_accuracy, + mmit_mean_average_precision, + top_k_accuracy) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Fusing multiple scores') + parser.add_argument( + '--preds', + nargs='+', + help='list of predict result', + default=['demo/fuse/joint.pkl', 'demo/fuse/bone.pkl']) + parser.add_argument( + '--coefficients', + nargs='+', + type=float, + help='coefficients of each score file', + default=[1.0, 1.0]) + parser.add_argument('--apply-softmax', action='store_true') + parser.add_argument( + '--multi-label', + action='store_true', + help='whether the task is multi label classification') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert len(args.preds) == len(args.coefficients) + data_sample_list = [load(f) for f in args.preds] + score_list = [] + for data_samples in data_sample_list: + scores = [sample['pred_score'].numpy() for sample in data_samples] + score_list.append(scores) + + if args.multi_label: + labels = [sample['gt_label'] for sample in data_sample_list[0]] + else: + labels = [sample['gt_label'].item() for sample in data_sample_list[0]] + + if args.apply_softmax: + + def apply_softmax(scores): + return [softmax(score) for score in scores] + + score_list = [apply_softmax(scores) for scores in score_list] + + weighted_scores = get_weighted_score(score_list, args.coefficients) + if args.multi_label: + mean_avg_prec = mmit_mean_average_precision( + np.array(weighted_scores), np.stack([t.numpy() for t in labels])) + print(f'MMit Average Precision: {mean_avg_prec:.04f}') + else: + mean_class_acc = mean_class_accuracy(weighted_scores, labels) + top_1_acc, top_5_acc = top_k_accuracy(weighted_scores, labels, (1, 5)) + print(f'Mean Class Accuracy: {mean_class_acc:.04f}') + print(f'Top 1 Accuracy: {top_1_acc:.04f}') + print(f'Top 
5 Accuracy: {top_5_acc:.04f}') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/report_map.py b/tools/analysis_tools/report_map.py new file mode 100644 index 0000000000000000000000000000000000000000..49206ff84be87d9d17f04b8e898260a12e478e0f --- /dev/null +++ b/tools/analysis_tools/report_map.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +import mmengine +import numpy as np + +from mmaction.evaluation import ActivityNetLocalization + +args = None + + +def cuhk17_top1(): + """Assign label for each proposal with the cuhk17 result, which is the #2 + entry in http://activity-net.org/challenges/2017/evaluation.html.""" + if not osp.exists('cuhk_anet17_pred.json'): + os.system('wget https://download.openmmlab.com/' + 'mmaction/localization/cuhk_anet17_pred.json') + proposal = mmengine.load(args.proposal) + results = proposal['results'] + cuhk_pred = mmengine.load('cuhk_anet17_pred.json')['results'] + + def get_topk(preds, k): + preds.sort(key=lambda x: x['score']) + return preds[-k:] + + for k, v in results.items(): + action_pred = cuhk_pred[k] + top1 = get_topk(action_pred, 1) + top1_label = top1[0]['label'] + new_value = [] + for item in v: + x = dict(label=top1_label) + x.update(item) + new_value.append(x) + results[k] = new_value + proposal['results'] = results + mmengine.dump(proposal, args.det_output) + + +cls_funcs = {'cuhk17_top1': cuhk17_top1} + + +def parse_args(): + parser = argparse.ArgumentParser(description='Report detection mAP for' + 'ActivityNet proposal file') + parser.add_argument('--proposal', type=str, help='proposal file') + parser.add_argument( + '--gt', + type=str, + default='data/ActivityNet/' + 'anet_anno_val.json', + help='groundtruth file') + parser.add_argument( + '--cls', + type=str, + default='cuhk17_top1', + choices=['cuhk17_top1'], + help='the way to assign label for each ' + 'proposal') + parser.add_argument( + '--det-output', + type=str, 
+ default='det_result.json', + help='the path to store detection results') + args = parser.parse_args() + return args + + +def main(): + global args, cls_funcs + args = parse_args() + func = cls_funcs[args.cls] + func() + anet_detection = ActivityNetLocalization( + args.gt, + args.det_output, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + verbose=True) + mAP, average_mAP = anet_detection.evaluate() + print('[RESULTS] Performance on ActivityNet detection task.\n' + f'mAP: {mAP}\nAverage-mAP: {average_mAP}') + + +if __name__ == '__main__': + main() diff --git a/tools/argparse.bash b/tools/argparse.bash new file mode 100644 index 0000000000000000000000000000000000000000..6182e393007568f8d12b7cca1ac3a146f968339a --- /dev/null +++ b/tools/argparse.bash @@ -0,0 +1,103 @@ +#!/usr/bin/env bash + +# Use python's argparse module in shell scripts +# +# The function `argparse` parses its arguments using +# argparse.ArgumentParser; the parser is defined in the function's +# stdin. +# +# Executing ``argparse.bash`` (as opposed to sourcing it) prints a +# script template. +# +# https://github.com/nhoffman/argparse-bash +# MIT License - Copyright (c) 2015 Noah Hoffman +# +# The MIT License (MIT) +# +# Copyright (c) 2015 Noah Hoffman +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +argparse(){ + argparser=$(mktemp 2>/dev/null || mktemp -t argparser) + cat > "$argparser" <> "$argparser" + + cat >> "$argparser" < /dev/null; then + eval $(python "$argparser" "$@") + retval=0 + else + python "$argparser" "$@" + retval=1 + fi + + rm "$argparser" + return $retval +} + +# print a script template when this script is executed +if [[ $0 == *argparse.bash ]]; then + cat < + +```BibTeX +@article{Heilbron2015ActivityNetAL, + title={ActivityNet: A large-scale video benchmark for human activity understanding}, + author={Fabian Caba Heilbron and Victor Escorcia and Bernard Ghanem and Juan Carlos Niebles}, + journal={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2015}, + pages={961-970} +} +``` + +For basic dataset information, please refer to the official [website](http://activity-net.org/). +For action detection, you can either use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) or extract feature with mmaction2 (which has better performance). +We release both pipeline. +Before we start, please make sure that current working directory is `$MMACTION2/tools/data/activitynet/`. + +## Option 1: Use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) + +### Step 1. Download Annotations + +First of all, you can run the following script to download annotation files. 
+
+```shell
+bash download_feature_annotations.sh
+```
+
+### Step 2. Prepare Video Features
+
+Then, you can run the following script to download activitynet features.
+
+```shell
+bash download_features.sh
+```
+
+### Step 3. Process Annotation Files
+
+Next, you can run the following script to process the downloaded annotation files for training and testing.
+It first merges the two annotation files together and then separates the annotations by `train`, `val` and `test`.
+
+```shell
+python process_annotations.py
+```
+
+## Option 2: Extract ActivityNet feature using MMAction2 with all videos provided in official [website](http://activity-net.org/)
+
+### Step 1. Download Annotations
+
+First of all, you can run the following script to download annotation files.
+
+```shell
+bash download_annotations.sh
+```
+
+### Step 2. Prepare Videos
+
+Then, you can run the following script to prepare videos.
+The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time.
+
+```shell
+bash download_videos.sh
+```
+
+Since some videos in the ActivityNet dataset might no longer be available on YouTube, the official [website](http://activity-net.org/) has made the full dataset available on Google and Baidu drives.
+To accommodate missing data requests, you can fill in this [request form](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform) provided in the official [download page](http://activity-net.org/download.html) to have 7-day access to download the videos from the drive folders.
+
+We also provide download steps for annotations from the [BSN repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation).
+
+```shell
+bash download_bsn_videos.sh
+```
+
+For this case, the downloading scripts update the annotation file after downloading to make sure every video in it exists.
+
+### Step 3.
 Extract RGB and Flow
+
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+
+Use the following scripts to extract both RGB and Flow.
+
+```shell
+bash extract_frames.sh
+```
+
+The command above can generate images with new short edge 256. If you want to generate images with short edge 320 (320p), or with a fixed size of 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`.
+More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md)
+
+### Step 4. Generate File List for ActivityNet Finetuning
+
+With extracted frames, you can generate video-level or clip-level lists of rawframes, which can be used for ActivityNet Finetuning.
+
+```shell
+python generate_rawframes_filelist.py
+```
+
+### Step 5. Finetune TSN models on ActivityNet
+
+You can use ActivityNet configs in `configs/recognition/tsn` to finetune TSN models on ActivityNet.
+You need to use Kinetics models for pretraining.
+Both RGB models and Flow models are supported.
+
+### Step 6. Extract ActivityNet Feature with finetuned ckpts
+
+After finetuning TSN on ActivityNet, you can use it to extract both RGB and Flow features.
+ +```shell +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score +``` + +After feature extraction, you can use our post processing scripts to concat RGB and Flow feature, generate the `100-t X 400-d` feature for Action Detection. + +```shell +python activitynet_feature_postprocessing.py --rgb ../../../data/ActivityNet/rgb_feat --flow ../../../data/ActivityNet/flow_feat --dest ../../../data/ActivityNet/mmaction_feat +``` + +## Final Step. Check Directory Structure + +After the whole data pipeline for ActivityNet preparation, +you will get the features, videos, frames and annotation files. 
+ +In the context of the whole project (for ActivityNet only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ActivityNet + +(if Option 1 used) +│ │ ├── anet_anno_{train,val,test,full}.json +│ │ ├── anet_anno_action.json +│ │ ├── video_info_new.csv +│ │ ├── activitynet_feature_cuhk +│ │ │ ├── csv_mean_100 +│ │ │ │ ├── v___c8enCfzqw.csv +│ │ │ │ ├── v___dXUJsj3yo.csv +│ │ │ | ├── .. + +(if Option 2 used) +│ │ ├── anet_train_video.txt +│ │ ├── anet_val_video.txt +│ │ ├── anet_train_clip.txt +│ │ ├── anet_val_clip.txt +│ │ ├── activity_net.v1-3.min.json +│ │ ├── mmaction_feat +│ │ │ ├── v___c8enCfzqw.csv +│ │ │ ├── v___dXUJsj3yo.csv +│ │ │ ├── .. +│ │ ├── rawframes +│ │ │ ├── v___c8enCfzqw +│ │ │ │ ├── img_00000.jpg +│ │ │ │ ├── flow_x_00000.jpg +│ │ │ │ ├── flow_y_00000.jpg +│ │ │ │ ├── .. +│ │ │ ├── .. + +``` + +For training and evaluating on ActivityNet, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
diff --git a/tools/data/activitynet/README_zh-CN.md b/tools/data/activitynet/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..3c852febcb6f7786a3816cf6f11e51bd785dcd0d --- /dev/null +++ b/tools/data/activitynet/README_zh-CN.md @@ -0,0 +1,185 @@ +# 准备 ActivityNet + +## 简介 + + + +```BibTeX +@article{Heilbron2015ActivityNetAL, + title={ActivityNet: A large-scale video benchmark for human activity understanding}, + author={Fabian Caba Heilbron and Victor Escorcia and Bernard Ghanem and Juan Carlos Niebles}, + journal={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2015}, + pages={961-970} +} +``` + +用户可参考该数据集的 [官网](http://activity-net.org/),以获取数据集相关的基本信息。 +对于时序动作检测任务,用户可以使用这个 [代码库](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) 提供的缩放过(rescaled)的 ActivityNet 特征, +或者使用 MMAction2 进行特征提取(这将具有更高的精度)。MMAction2 同时提供了以上所述的两种数据使用流程。 +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/activitynet/`。 + +## 选项 1:用户可以使用这个 [代码库](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) 提供的特征 + +### 步骤 1. 下载标注文件 + +首先,用户可以使用以下命令下载标注文件。 + +```shell +bash download_feature_annotations.sh +``` + +### 步骤 2. 准备视频特征 + +之后,用户可以使用以下命令下载 ActivityNet 特征。 + +```shell +bash download_features.sh +``` + +### 步骤 3. 处理标注文件 + +之后,用户可以使用以下命令处理下载的标注文件,以便于训练和测试。 +该脚本会首先合并两个标注文件,然后再将其分为 `train`, `val` 和 `test` 三个部分。 + +```shell +python process_annotations.py +``` + +## 选项 2:使用 MMAction2 对 [官网](http://activity-net.org/) 提供的视频进行特征抽取 + +### 步骤 1. 下载标注文件 + +首先,用户可以使用以下命令下载标注文件。 + +```shell +bash download_annotations.sh +``` + +### 步骤 2. 
准备视频 + +之后,用户可以使用以下脚本准备视频数据。 +该代码参考自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics),该过程将会耗费较多时间。 + +```shell +bash download_videos.sh +``` + +由于 ActivityNet 数据集中的一些视频已经在 YouTube 失效,[官网](http://activity-net.org/) 在谷歌网盘和百度网盘提供了完整的数据集数据。 +如果用户想要获取失效的数据集,则需要填写 [下载页面](http://activity-net.org/download.html) 中提供的 [需求表格](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform) 以获取 7 天的下载权限。 + +MMAction2 同时也提供了 [BSN 代码库](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) 的标注文件的下载步骤。 + +```shell +bash download_bsn_videos.sh +``` + +对于这种情况,该下载脚本将在下载后更新此标注文件,以确保每个视频都存在。 + +### 步骤 3. 抽取 RGB 帧和光流 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +可使用以下命令抽取视频帧和光流。 + +```shell +bash extract_frames.sh +``` + +以上脚本将会生成短边 256 分辨率的视频。如果用户想生成短边 320 分辨率的视频(即 320p),或者 340x256 的固定分辨率,用户可以通过改变参数由 `--new-short 256` 至 `--new-short 320`,或者 `--new-width 340 --new-height 256` 进行设置 +更多细节可参考 [数据准备指南](/docs/zh_cn/user_guides/prepare_dataset.md) + +### 步骤 4. 生成用于 ActivityNet 微调的文件列表 + +根据抽取的帧,用户可以生成视频级别(video-level)或者片段级别(clip-level)的文件列表,其可用于微调 ActivityNet。 + +```shell +python generate_rawframes_filelist.py +``` + +### 步骤 5. 在 ActivityNet 上微调 TSN 模型 + +用户可使用 `configs/recognition/tsn` 目录中的 ActivityNet 配置文件进行 TSN 模型微调。 +用户需要使用 Kinetics 相关模型(同时支持 RGB 模型与光流模型)进行预训练。 + +### 步骤 6. 
使用预训练模型进行 ActivityNet 特征抽取 + +在 ActivityNet 上微调 TSN 模型之后,用户可以使用该模型进行 RGB 特征和光流特征的提取。 + +```shell +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \ + path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_tarin_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_train_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score + +python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \ + /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \ + --video-list ../../../data/ActivityNet/anet_val_video.txt \ + --video-root ../../../data/ActivityNet/rawframes \ + --dump-score +``` + +在提取完特征后,用户可以使用后处理脚本整合 RGB 特征和光流特征,生成 `100-t X 400-d` 维度的特征用于时序动作检测。 + +```shell +python activitynet_feature_postprocessing.py --rgb ../../../data/ActivityNet/rgb_feat --flow ../../../data/ActivityNet/flow_feat --dest ../../../data/ActivityNet/mmaction_feat +``` + +## 最后一步:检查文件夹结构 + +在完成所有 ActivityNet 数据集准备流程后,用户可以获得对应的特征文件,RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,ActivityNet 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ActivityNet + +(若根据选项 1 进行数据处理) +│ │ ├── anet_anno_{train,val,test,full}.json +│ │ ├── anet_anno_action.json +│ │ ├── video_info_new.csv +│ │ ├── activitynet_feature_cuhk +│ │ │ ├── csv_mean_100 +│ │ │ │ ├── v___c8enCfzqw.csv +│ │ │ │ ├── v___dXUJsj3yo.csv +│ │ │ | ├── .. 
+ +(若根据选项 2 进行数据处理) +│ │ ├── anet_train_video.txt +│ │ ├── anet_val_video.txt +│ │ ├── anet_train_clip.txt +│ │ ├── anet_val_clip.txt +│ │ ├── activity_net.v1-3.min.json +│ │ ├── mmaction_feat +│ │ │ ├── v___c8enCfzqw.csv +│ │ │ ├── v___dXUJsj3yo.csv +│ │ │ ├── .. +│ │ ├── rawframes +│ │ │ ├── v___c8enCfzqw +│ │ │ │ ├── img_00000.jpg +│ │ │ │ ├── flow_x_00000.jpg +│ │ │ │ ├── flow_y_00000.jpg +│ │ │ │ ├── .. +│ │ │ ├── .. + +``` + +关于对 ActivityNet 进行训练和验证,可以参考 [训练教程](/docs/zh_cn/user_guides/train_test.md). diff --git a/tools/data/activitynet/action_name.csv b/tools/data/activitynet/action_name.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff639a9ff7e3e157ce8440b13f58d23d159b6c10 --- /dev/null +++ b/tools/data/activitynet/action_name.csv @@ -0,0 +1,201 @@ +action +Applying sunscreen +Arm wrestling +Assembling bicycle +BMX +Baking cookies +Baton twirling +Beach soccer +Beer pong +Blow-drying hair +Blowing leaves +Playing ten pins +Braiding hair +Building sandcastles +Bullfighting +Calf roping +Camel ride +Canoeing +Capoeira +Carving jack-o-lanterns +Changing car wheel +Cleaning sink +Clipping cat claws +Croquet +Curling +Cutting the grass +Decorating the Christmas tree +Disc dog +Doing a powerbomb +Doing crunches +Drum corps +Elliptical trainer +Doing fencing +Fixing the roof +Fun sliding down +Futsal +Gargling mouthwash +Grooming dog +Hand car wash +Hanging wallpaper +Having an ice cream +Hitting a pinata +Hula hoop +Hurling +Ice fishing +Installing carpet +Kite flying +Kneeling +Knitting +Laying tile +Longboarding +Making a cake +Making a lemonade +Making an omelette +Mooping floor +Painting fence +Painting furniture +Peeling potatoes +Plastering +Playing beach volleyball +Playing blackjack +Playing congas +Playing drums +Playing ice hockey +Playing pool +Playing rubik cube +Powerbocking +Putting in contact lenses +Putting on shoes +Rafting +Raking leaves +Removing ice from car +Riding bumper cars +River tubing +Rock-paper-scissors 
+Rollerblading +Roof shingle removal +Rope skipping +Running a marathon +Scuba diving +Sharpening knives +Shuffleboard +Skiing +Slacklining +Snow tubing +Snowboarding +Spread mulch +Sumo +Surfing +Swimming +Swinging at the playground +Table soccer +Throwing darts +Trimming branches or hedges +Tug of war +Using the monkey bar +Using the rowing machine +Wakeboarding +Waterskiing +Waxing skis +Welding +Drinking coffee +Zumba +Doing kickboxing +Doing karate +Tango +Putting on makeup +High jump +Playing bagpipes +Cheerleading +Wrapping presents +Cricket +Clean and jerk +Preparing pasta +Bathing dog +Discus throw +Playing field hockey +Grooming horse +Preparing salad +Playing harmonica +Playing saxophone +Chopping wood +Washing face +Using the pommel horse +Javelin throw +Spinning +Ping-pong +Making a sandwich +Brushing hair +Playing guitarra +Doing step aerobics +Drinking beer +Playing polo +Snatch +Paintball +Long jump +Cleaning windows +Brushing teeth +Playing flauta +Tennis serve with ball bouncing +Bungee jumping +Triple jump +Horseback riding +Layup drill in basketball +Vacuuming floor +Cleaning shoes +Doing nails +Shot put +Fixing bicycle +Washing hands +Ironing clothes +Using the balance beam +Shoveling snow +Tumbling +Using parallel bars +Getting a tattoo +Rock climbing +Smoking hookah +Shaving +Getting a piercing +Springboard diving +Playing squash +Playing piano +Dodgeball +Smoking a cigarette +Sailing +Getting a haircut +Playing lacrosse +Cumbia +Tai chi +Painting +Mowing the lawn +Shaving legs +Walking the dog +Hammer throw +Skateboarding +Polishing shoes +Ballet +Hand washing clothes +Plataform diving +Playing violin +Breakdancing +Windsurfing +Hopscotch +Doing motocross +Mixing drinks +Starting a campfire +Belly dance +Removing curlers +Archery +Volleyball +Playing water polo +Playing racquetball +Kayaking +Polishing forniture +Playing kickball +Using uneven bars +Washing dishes +Pole vault +Playing accordion +Playing badminton diff --git 
a/tools/data/activitynet/activitynet_feature_postprocessing.py b/tools/data/activitynet/activitynet_feature_postprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..789d8583c3e5817076e433a9c18b22781e4ca41c --- /dev/null +++ b/tools/data/activitynet/activitynet_feature_postprocessing.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import multiprocessing +import os +import os.path as osp + +import numpy as np +import scipy.interpolate +from mmengine import dump, load + +args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='ANet Feature Prepare') + parser.add_argument('--rgb', default='', help='rgb feature root') + parser.add_argument('--flow', default='', help='flow feature root') + parser.add_argument('--dest', default='', help='dest root') + parser.add_argument('--output-format', default='csv') + args = parser.parse_args() + return args + + +def pool_feature(data, num_proposals=100, num_sample_bins=3, pool_type='mean'): + """Pool features with arbitrary temporal length. + + Args: + data (list[np.ndarray] | np.ndarray): Features of an untrimmed video, + with arbitrary temporal length. + num_proposals (int): The temporal dim of pooled feature. Default: 100. + num_sample_bins (int): How many points to sample to get the feature + vector at one timestamp. Default: 3. + pool_type (str): Type of pooling to pool features. Choices are + ['mean', 'max']. Default: 'mean'. + + Returns: + np.ndarray: The pooled feature with shape num_proposals x feature_dim. 
+ """ + if len(data) == 1: + return np.concatenate([data] * num_proposals) + x_range = list(range(len(data))) + f = scipy.interpolate.interp1d(x_range, data, axis=0) + eps = 1e-4 + start, end = eps, len(data) - 1 - eps + anchor_size = (end - start) / num_proposals + ptr = start + feature = [] + for _ in range(num_proposals): + x_new = [ + ptr + i / num_sample_bins * anchor_size + for i in range(num_sample_bins) + ] + y_new = f(x_new) + if pool_type == 'mean': + y_new = np.mean(y_new, axis=0) + elif pool_type == 'max': + y_new = np.max(y_new, axis=0) + else: + raise NotImplementedError('Unsupported pool type') + feature.append(y_new) + ptr += anchor_size + feature = np.stack(feature) + return feature + + +def merge_feat(name): + # concatenate rgb feat and flow feat for a single sample + rgb_feat = load(osp.join(args.rgb, name)) + flow_feat = load(osp.join(args.flow, name)) + rgb_feat = pool_feature(rgb_feat) + flow_feat = pool_feature(flow_feat) + feat = np.concatenate([rgb_feat, flow_feat], axis=-1) + if not osp.exists(args.dest): + os.system(f'mkdir -p {args.dest}') + if args.output_format == 'pkl': + dump(feat, osp.join(args.dest, name)) + elif args.output_format == 'csv': + feat = feat.tolist() + lines = [] + line0 = ','.join([f'f{i}' for i in range(400)]) + lines.append(line0) + for line in feat: + lines.append(','.join([f'{x:.4f}' for x in line])) + with open(osp.join(args.dest, name.replace('.pkl', '.csv')), 'w') as f: + f.write('\n'.join(lines)) + + +def main(): + global args + args = parse_args() + rgb_feat = [file for file in os.listdir(args.rgb) if file.endswith('.pkl')] + flow_feat = [ + file for file in os.listdir(args.flow) if file.endswith('.pkl') + ] + assert set(rgb_feat) == set(flow_feat) + # for feat in rgb_feat: + # merge_feat(feat) + pool = multiprocessing.Pool(32) + pool.map(merge_feat, rgb_feat) + + +if __name__ == '__main__': + main() diff --git a/tools/data/activitynet/convert_proposal_format.py 
b/tools/data/activitynet/convert_proposal_format.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f69fe66531444e7163c7aa15ecdedd5b39e8da --- /dev/null +++ b/tools/data/activitynet/convert_proposal_format.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This file converts the output proposal file of proposal generator (BSN, BMN) +into the input proposal file of action classifier (Currently supports SSN and +P-GCN, not including TSN, I3D etc.).""" +import argparse + +import mmengine +import numpy as np + +from mmaction.evaluation import pairwise_temporal_iou + + +def load_annotations(ann_file): + """Load the annotation according to ann_file into video_infos.""" + video_infos = [] + anno_database = mmengine.load(ann_file) + for video_name in anno_database: + video_info = anno_database[video_name] + video_info['video_name'] = video_name + video_infos.append(video_info) + return video_infos + + +def import_ground_truth(video_infos, activity_index): + """Read ground truth data from video_infos.""" + ground_truth = {} + for video_info in video_infos: + video_id = video_info['video_name'][2:] + this_video_ground_truths = [] + for ann in video_info['annotations']: + t_start, t_end = ann['segment'] + label = activity_index[ann['label']] + this_video_ground_truths.append([t_start, t_end, label]) + ground_truth[video_id] = np.array(this_video_ground_truths) + return ground_truth + + +def import_proposals(result_dict): + """Read predictions from result dict.""" + proposals = {} + num_proposals = 0 + for video_id in result_dict: + result = result_dict[video_id] + this_video_proposals = [] + for proposal in result: + t_start, t_end = proposal['segment'] + score = proposal['score'] + this_video_proposals.append([t_start, t_end, score]) + num_proposals += 1 + proposals[video_id] = np.array(this_video_proposals) + return proposals, num_proposals + + +def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts, + proposals, 
tiou, t_overlap_self, + formatted_proposal_file): + """dump the formatted proposal file, which is the input proposal file of + action classifier (e.g: SSN). + + Args: + video_idx (int): Index of video. + video_id (str): ID of video. + num_frames (int): Total frames of the video. + fps (float): Fps of the video. + gts (np.ndarray[float]): t_start, t_end and label of groundtruths. + proposals (np.ndarray[float]): t_start, t_end and score of proposals. + tiou (np.ndarray[float]): 2-dim array with IoU ratio. + t_overlap_self (np.ndarray[float]): 2-dim array with overlap_self + (union / self_len) ratio. + formatted_proposal_file (open file object): Open file object of + formatted_proposal_file. + """ + + formatted_proposal_file.write( + f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n') + for gt in gts: + formatted_proposal_file.write(f'{int(gt[2])} {gt[0]} {gt[1]}\n') + formatted_proposal_file.write(f'{proposals.shape[0]}\n') + + best_iou = np.amax(tiou, axis=0) + best_iou_index = np.argmax(tiou, axis=0) + best_overlap = np.amax(t_overlap_self, axis=0) + best_overlap_index = np.argmax(t_overlap_self, axis=0) + + for i in range(proposals.shape[0]): + index_iou = best_iou_index[i] + index_overlap = best_overlap_index[i] + label_iou = gts[index_iou][2] + label_overlap = gts[index_overlap][2] + if label_iou != label_overlap: + label = label_iou if label_iou != 0 else label_overlap + else: + label = label_iou + if best_iou[i] == 0 and best_overlap[i] == 0: + formatted_proposal_file.write( + f'0 0 0 {proposals[i][0]} {proposals[i][1]}\n') + else: + formatted_proposal_file.write( + f'{int(label)} {best_iou[i]} {best_overlap[i]} ' + f'{proposals[i][0]} {proposals[i][1]}\n') + + +def parse_args(): + parser = argparse.ArgumentParser(description='convert proposal format') + parser.add_argument( + '--ann-file', + type=str, + default='../../../data/ActivityNet/anet_anno_val.json', + help='name of annotation file') + parser.add_argument( + '--activity-index-file', + 
type=str, + default='../../../data/ActivityNet/anet_activity_indexes_val.txt', + help='name of activity index file') + parser.add_argument( + '--proposal-file', + type=str, + default='../../../results.json', + help='name of proposal file, which is the' + 'output of proposal generator (BMN)') + parser.add_argument( + '--formatted-proposal-file', + type=str, + default='../../../anet_val_formatted_proposal.txt', + help='name of formatted proposal file, which is the' + 'input of action classifier (SSN)') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + formatted_proposal_file = open(args.formatted_proposal_file, 'w') + + # The activity index file is constructed according to + # 'https://github.com/activitynet/ActivityNet/blob/master/Evaluation/eval_classification.py' + activity_index, class_idx = {}, 0 + for line in open(args.activity_index_file).readlines(): + activity_index[line.strip()] = class_idx + class_idx += 1 + + video_infos = load_annotations(args.ann_file) + ground_truth = import_ground_truth(video_infos, activity_index) + proposal, num_proposals = import_proposals( + mmengine.load(args.proposal_file)['results']) + video_idx = 0 + + for video_info in video_infos: + video_id = video_info['video_name'][2:] + num_frames = video_info['duration_frame'] + fps = video_info['fps'] + tiou, t_overlap = pairwise_temporal_iou( + proposal[video_id][:, :2].astype(float), + ground_truth[video_id][:, :2].astype(float), + calculate_overlap_self=True) + + dump_formatted_proposal(video_idx, video_id, num_frames, fps, + ground_truth[video_id], proposal[video_id], + tiou, t_overlap, formatted_proposal_file) + video_idx += 1 + formatted_proposal_file.close() diff --git a/tools/data/activitynet/download.py b/tools/data/activitynet/download.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5ea866256e622742857d8979780e9bee2584f9 --- /dev/null +++ b/tools/data/activitynet/download.py @@ -0,0 +1,148 @@ +# 
Copyright (c) OpenMMLab. All rights reserved. +# This scripts is copied from +# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The code is licensed under the MIT licence. +import argparse +import os +import ssl +import subprocess + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context +data_file = '../../../data/ActivityNet' +output_dir = f'{data_file}/videos' + + +def parse_args(): + parser = argparse.ArgumentParser(description='ActivityNet downloader') + parser.add_argument( + '--bsn', + action='store_true', + help='download for BSN annotation or official one') + args = parser.parse_args() + return args + + +def download_clip(video_identifier, + output_filename, + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + + if not os.path.exists(output_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate', + '-f', 'mp4', '-o', + '"%s"' % output_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Fail' + else: + break + # Check if the video was successfully saved. 
+ status = os.path.exists(output_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(youtube_id, output_dir): + """Wrapper for parallel processing purposes.""" + # we do this to align with names in annotations + output_filename = os.path.join(output_dir, 'v_' + youtube_id + '.mp4') + if os.path.exists(output_filename): + status = tuple(['v_' + youtube_id, True, 'Exists']) + return status + + downloaded, log = download_clip(youtube_id, output_filename) + status = tuple(['v_' + youtube_id, downloaded, log]) + return status + + +def parse_activitynet_annotations(input_csv, is_bsn_case=False): + """Returns a list of YoutubeID. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'video,numFrame,seconds,fps,rfps,subset,featureFrame' + returns: + ------- + youtube_ids: list + List of all YoutubeIDs in ActivityNet. + + """ + if is_bsn_case: + lines = open(input_csv).readlines() + lines = lines[1:] + # YoutubeIDs do not have prefix `v_` + youtube_ids = [x.split(',')[0][2:] for x in lines] + else: + data = mmengine.load(anno_file)['database'] + youtube_ids = list(data.keys()) + + return youtube_ids + + +def main(input_csv, output_dir, anno_file, num_jobs=24, is_bsn_case=False): + # Reading and parsing ActivityNet. + youtube_ids = parse_activitynet_annotations(input_csv, is_bsn_case) + + # Creates folders where videos will be saved later. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Download all clips. + if num_jobs == 1: + status_list = [] + for index in youtube_ids: + status_list.append(download_clip_wrapper(index, output_dir)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_clip_wrapper)(index, output_dir) + for index in youtube_ids) + + # Save download report. 
+ mmengine.dump(status_list, 'download_report.json') + annotation = mmengine.load(anno_file) + downloaded = {status[0]: status[1] for status in status_list} + annotation = {k: v for k, v in annotation.items() if downloaded[k]} + + if is_bsn_case: + anno_file_bak = anno_file.replace('.json', '_bak.json') + os.rename(anno_file, anno_file_bak) + mmengine.dump(annotation, anno_file) + + +if __name__ == '__main__': + args = parse_args() + is_bsn_case = args.bsn + if is_bsn_case: + video_list = f'{data_file}/video_info_new.csv' + anno_file = f'{data_file}/anet_anno_action.json' + else: + video_list = f'{data_file}/activity_net.v1-3.min.json' + anno_file = video_list + main(video_list, output_dir, anno_file, 24, is_bsn_case) diff --git a/tools/data/activitynet/download_annotations.sh b/tools/data/activitynet/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..8af7b938a7325cd6418df81908c9fa7aee5880bf --- /dev/null +++ b/tools/data/activitynet/download_annotations.sh @@ -0,0 +1,12 @@ +DATA_DIR="../../../data/ActivityNet/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/files/activity_net.v1-3.min.json + +cd - diff --git a/tools/data/activitynet/download_bsn_videos.sh b/tools/data/activitynet/download_bsn_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..b0b187959bcdf1d861081ff2a3fecc364475132f --- /dev/null +++ b/tools/data/activitynet/download_bsn_videos.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate activitynet +pip install --upgrade youtube-dl +pip install mmcv + +DATA_DIR="../../../data/ActivityNet" +python download.py --bsn + +source deactivate activitynet +conda remove -n activitynet --all diff --git a/tools/data/activitynet/download_feature_annotations.sh b/tools/data/activitynet/download_feature_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..f59452165ac5c0f0c99e1dd2a1d84d8f7fc4e5b8 --- /dev/null +++ b/tools/data/activitynet/download_feature_annotations.sh @@ -0,0 +1,16 @@ +DATA_DIR="../../../data/ActivityNet/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://raw.githubusercontent.com/wzmsltw/BSN-boundary-sensitive-network/master/data/activitynet_annotations/anet_anno_action.json + +wget https://raw.githubusercontent.com/wzmsltw/BSN-boundary-sensitive-network/master/data/activitynet_annotations/video_info_new.csv + +wget https://download.openmmlab.com/mmaction/localization/anet_activity_indexes_val.txt + +cd - diff --git a/tools/data/activitynet/download_features.sh b/tools/data/activitynet/download_features.sh new file mode 100644 index 0000000000000000000000000000000000000000..c668d77c35e19b66c4cccc9bffdb2c76fbce9813 --- /dev/null +++ b/tools/data/activitynet/download_features.sh @@ -0,0 +1,11 @@ +DATA_DIR="../../../data/ActivityNet/activitynet_feature_cuhk/" + +if [[ ! 
-d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ISemndlSDS2FtqQOKL0t3Cjj9yk2yznF' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ISemndlSDS2FtqQOKL0t3Cjj9yk2yznF" -O "csv_mean_100.zip" && rm -rf /tmp/cookies.txt + +unzip csv_mean_100.zip -d ${DATA_DIR}/ +rm csv_mean_100.zip diff --git a/tools/data/activitynet/download_videos.sh b/tools/data/activitynet/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..047c0f199a71d47d781768f66c1d42495814cb6d --- /dev/null +++ b/tools/data/activitynet/download_videos.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate activitynet +pip install --upgrade youtube-dl +pip install mmcv + +DATA_DIR="../../../data/ActivityNet" +python download.py + +source deactivate activitynet +conda remove -n activitynet --all diff --git a/tools/data/activitynet/environment.yml b/tools/data/activitynet/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..050d6e6a78397564a694fea90f374886da2c2914 --- /dev/null +++ b/tools/data/activitynet/environment.yml @@ -0,0 +1,36 @@ +name: activitynet +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - 
python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/activitynet/extract_frames.sh b/tools/data/activitynet/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..a3496ac9645774afa828c2918b79e5426290de92 --- /dev/null +++ b/tools/data/activitynet/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +cd ../ +python build_rawframes.py ../../data/ActivityNet/videos/ ../../data/ActivityNet/rawframes/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +cd activitynet/ diff --git a/tools/data/activitynet/generate_rawframes_filelist.py b/tools/data/activitynet/generate_rawframes_filelist.py new file mode 100644 index 0000000000000000000000000000000000000000..7a08130d75cd9fe9e32be2377e508b21cf87108e --- /dev/null +++ b/tools/data/activitynet/generate_rawframes_filelist.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp + +data_file = '../../../data/ActivityNet' +video_list = f'{data_file}/video_info_new.csv' +anno_file = f'{data_file}/anet_anno_action.json' +rawframe_dir = f'{data_file}/rawframes' +action_name_list = 'action_name.csv' + +train_rawframe_dir = rawframe_dir +val_rawframe_dir = rawframe_dir + +json_file = f'{data_file}/activity_net.v1-3.min.json' + + +def generate_rawframes_filelist(): + load_dict = json.load(open(json_file)) + + anet_labels = open(action_name_list).readlines() + anet_labels = [x.strip() for x in anet_labels[1:]] + + train_dir_list = [ + osp.join(train_rawframe_dir, x) for x in os.listdir(train_rawframe_dir) + ] + val_dir_list = [ + osp.join(val_rawframe_dir, x) for x in os.listdir(val_rawframe_dir) + ] + + def simple_label(anno): + label = anno[0]['label'] + return anet_labels.index(label) + + def count_frames(dir_list, video): + for dir_name in dir_list: + if video in dir_name: + return osp.basename(dir_name), 
len(os.listdir(dir_name)) + return None, None + + database = load_dict['database'] + training = {} + validation = {} + key_dict = {} + + for k in database: + data = database[k] + subset = data['subset'] + + if subset in ['training', 'validation']: + annotations = data['annotations'] + label = simple_label(annotations) + if subset == 'training': + dir_list = train_dir_list + data_dict = training + else: + dir_list = val_dir_list + data_dict = validation + + else: + continue + + gt_dir_name, num_frames = count_frames(dir_list, k) + if gt_dir_name is None: + continue + data_dict[gt_dir_name] = [num_frames, label] + key_dict[gt_dir_name] = k + + train_lines = [ + k + ' ' + str(training[k][0]) + ' ' + str(training[k][1]) + for k in training + ] + val_lines = [ + k + ' ' + str(validation[k][0]) + ' ' + str(validation[k][1]) + for k in validation + ] + + with open(osp.join(data_file, 'anet_train_video.txt'), 'w') as fout: + fout.write('\n'.join(train_lines)) + with open(osp.join(data_file, 'anet_val_video.txt'), 'w') as fout: + fout.write('\n'.join(val_lines)) + + def clip_list(k, anno, video_anno): + duration = anno['duration'] + num_frames = video_anno[0] + fps = num_frames / duration + segs = anno['annotations'] + lines = [] + for seg in segs: + segment = seg['segment'] + label = seg['label'] + label = anet_labels.index(label) + start, end = int(segment[0] * fps), int(segment[1] * fps) + if end > num_frames - 1: + end = num_frames - 1 + newline = f'{k} {start} {end - start + 1} {label}' + lines.append(newline) + return lines + + train_clips, val_clips = [], [] + for k in training: + train_clips.extend(clip_list(k, database[key_dict[k]], training[k])) + for k in validation: + val_clips.extend(clip_list(k, database[key_dict[k]], validation[k])) + + with open(osp.join(data_file, 'anet_train_clip.txt'), 'w') as fout: + fout.write('\n'.join(train_clips)) + with open(osp.join(data_file, 'anet_val_clip.txt'), 'w') as fout: + fout.write('\n'.join(val_clips)) + + +if __name__ 
== '__main__': + generate_rawframes_filelist() diff --git a/tools/data/activitynet/label_map.txt b/tools/data/activitynet/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c3646c2665359bacb823bb1d6ede1b1e2f973 --- /dev/null +++ b/tools/data/activitynet/label_map.txt @@ -0,0 +1,200 @@ +Applying sunscreen +Arm wrestling +Assembling bicycle +BMX +Baking cookies +Baton twirling +Beach soccer +Beer pong +Blow-drying hair +Blowing leaves +Playing ten pins +Braiding hair +Building sandcastles +Bullfighting +Calf roping +Camel ride +Canoeing +Capoeira +Carving jack-o-lanterns +Changing car wheel +Cleaning sink +Clipping cat claws +Croquet +Curling +Cutting the grass +Decorating the Christmas tree +Disc dog +Doing a powerbomb +Doing crunches +Drum corps +Elliptical trainer +Doing fencing +Fixing the roof +Fun sliding down +Futsal +Gargling mouthwash +Grooming dog +Hand car wash +Hanging wallpaper +Having an ice cream +Hitting a pinata +Hula hoop +Hurling +Ice fishing +Installing carpet +Kite flying +Kneeling +Knitting +Laying tile +Longboarding +Making a cake +Making a lemonade +Making an omelette +Mooping floor +Painting fence +Painting furniture +Peeling potatoes +Plastering +Playing beach volleyball +Playing blackjack +Playing congas +Playing drums +Playing ice hockey +Playing pool +Playing rubik cube +Powerbocking +Putting in contact lenses +Putting on shoes +Rafting +Raking leaves +Removing ice from car +Riding bumper cars +River tubing +Rock-paper-scissors +Rollerblading +Roof shingle removal +Rope skipping +Running a marathon +Scuba diving +Sharpening knives +Shuffleboard +Skiing +Slacklining +Snow tubing +Snowboarding +Spread mulch +Sumo +Surfing +Swimming +Swinging at the playground +Table soccer +Throwing darts +Trimming branches or hedges +Tug of war +Using the monkey bar +Using the rowing machine +Wakeboarding +Waterskiing +Waxing skis +Welding +Drinking coffee +Zumba +Doing kickboxing +Doing karate +Tango +Putting on makeup 
+High jump +Playing bagpipes +Cheerleading +Wrapping presents +Cricket +Clean and jerk +Preparing pasta +Bathing dog +Discus throw +Playing field hockey +Grooming horse +Preparing salad +Playing harmonica +Playing saxophone +Chopping wood +Washing face +Using the pommel horse +Javelin throw +Spinning +Ping-pong +Making a sandwich +Brushing hair +Playing guitarra +Doing step aerobics +Drinking beer +Playing polo +Snatch +Paintball +Long jump +Cleaning windows +Brushing teeth +Playing flauta +Tennis serve with ball bouncing +Bungee jumping +Triple jump +Horseback riding +Layup drill in basketball +Vacuuming floor +Cleaning shoes +Doing nails +Shot put +Fixing bicycle +Washing hands +Ironing clothes +Using the balance beam +Shoveling snow +Tumbling +Using parallel bars +Getting a tattoo +Rock climbing +Smoking hookah +Shaving +Getting a piercing +Springboard diving +Playing squash +Playing piano +Dodgeball +Smoking a cigarette +Sailing +Getting a haircut +Playing lacrosse +Cumbia +Tai chi +Painting +Mowing the lawn +Shaving legs +Walking the dog +Hammer throw +Skateboarding +Polishing shoes +Ballet +Hand washing clothes +Plataform diving +Playing violin +Breakdancing +Windsurfing +Hopscotch +Doing motocross +Mixing drinks +Starting a campfire +Belly dance +Removing curlers +Archery +Volleyball +Playing water polo +Playing racquetball +Kayaking +Polishing forniture +Playing kickball +Using uneven bars +Washing dishes +Pole vault +Playing accordion +Playing badminton diff --git a/tools/data/activitynet/process_annotations.py b/tools/data/activitynet/process_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..dbe3e91e6ad67b33b3f484aa498e0ccf949efbe1 --- /dev/null +++ b/tools/data/activitynet/process_annotations.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""This file processes the annotation files and generates proper annotation +files for localizers.""" +import json + +import numpy as np + + +def load_json(file): + with open(file) as json_file: + data = json.load(json_file) + return data + + +data_file = '../../../data/ActivityNet' +info_file = f'{data_file}/video_info_new.csv' +ann_file = f'{data_file}/anet_anno_action.json' + +anno_database = load_json(ann_file) + +video_record = np.loadtxt(info_file, dtype=str, delimiter=',', skiprows=1) + +video_dict_train = {} +video_dict_val = {} +video_dict_test = {} +video_dict_full = {} + +for _, video_item in enumerate(video_record): + video_name = video_item[0] + video_info = anno_database[video_name] + video_subset = video_item[5] + video_info['fps'] = video_item[3].astype(np.float64) + video_info['rfps'] = video_item[4].astype(np.float64) + video_dict_full[video_name] = video_info + if video_subset == 'training': + video_dict_train[video_name] = video_info + elif video_subset == 'testing': + video_dict_test[video_name] = video_info + elif video_subset == 'validation': + video_dict_val[video_name] = video_info + +print(f'full subset video numbers: {len(video_record)}') + +with open(f'{data_file}/anet_anno_train.json', 'w') as result_file: + json.dump(video_dict_train, result_file) + +with open(f'{data_file}/anet_anno_val.json', 'w') as result_file: + json.dump(video_dict_val, result_file) + +with open(f'{data_file}/anet_anno_test.json', 'w') as result_file: + json.dump(video_dict_test, result_file) + +with open(f'{data_file}/anet_anno_full.json', 'w') as result_file: + json.dump(video_dict_full, result_file) diff --git a/tools/data/activitynet/tsn_extract_flow_feat_config.py b/tools/data/activitynet/tsn_extract_flow_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3305a17501221307efe27df9bd472d8ac0bb9542 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_flow_feat_config.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +clip_len = 5 +model = dict( + backbone=dict(in_channels=2 * clip_len), + data_preprocessor=dict(mean=[128], std=[128])) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='UntrimmedSampleFrames', clip_len=clip_len, clip_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW_Flow'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + filename_tmpl='{}_{:05d}.jpg', + modality='Flow', + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/activitynet/tsn_extract_rgb_feat_config.py b/tools/data/activitynet/tsn_extract_rgb_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f1cd53a4f42ddf95cd789c5a8cdba0170c617d75 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_rgb_feat_config.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'RawframeDataset' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(img=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/activitynet/tsn_extract_video_feat_config.py b/tools/data/activitynet/tsn_extract_video_feat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..8e323262f27d51cf4dc12c6a3a7231f79576c6f8 --- /dev/null +++ b/tools/data/activitynet/tsn_extract_video_feat_config.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = [ + 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/kinetics400/videos_val' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = [] + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/anno_txt2json.py b/tools/data/anno_txt2json.py new file mode 100644 index 0000000000000000000000000000000000000000..6f9790641c4594551fc179b01665c60366926a16 --- /dev/null +++ b/tools/data/anno_txt2json.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse + +import mmengine + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert txt annotation list to json') + parser.add_argument( + 'annofile', type=str, help='the txt annotation file to convert') + parser.add_argument( + '--format', + type=str, + default='rawframes', + choices=['rawframes', 'videos'], + help='the format of the txt annotation file') + parser.add_argument( + '--output', + type=str, + default=None, + help=( + 'the output file name, use annofile.replace(\'.txt\', \'.json\') ' + 'if the arg value is None')) + args = parser.parse_args() + + return args + + +def lines2dictlist(lines, format): + """Convert lines in 'txt' format to dictionaries in 'json' format. + Currently support single-label and multi-label. + + Example of a single-label rawframes annotation txt file: + + .. code-block:: txt + + (frame_dir num_frames label) + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + + Example of a multi-label rawframes annotation txt file: + + .. code-block:: txt + + (frame_dir num_frames label1 label2 ...) + some/directory-1 163 1 3 5 + some/directory-2 122 1 2 + some/directory-3 258 2 + + Example of a single-label videos annotation txt file: + + .. code-block:: txt + + (filename label) + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + + Example of a multi-label videos annotation txt file: + + .. code-block:: txt + + (filename label1 label2 ...) + some/path/000.mp4 1 3 5 + some/path/001.mp4 1 4 8 + some/path/002.mp4 2 4 9 + + Args: + lines (list): List of lines in 'txt' label format. + format (str): Data format, choices are 'rawframes' and 'videos'. + + Returns: + list[dict]: For rawframes format, each dict has keys: frame_dir, + total_frames, label; for videos format, each diction has keys: + filename, label. 
+ """ + lines = [x.split() for x in lines] + if format == 'rawframes': + data = [ + dict( + frame_dir=line[0], + total_frames=int(line[1]), + label=[int(x) for x in line[2:]]) for line in lines + ] + elif format == 'videos': + data = [ + dict(filename=line[0], label=[int(x) for x in line[1:]]) + for line in lines + ] + return data + + +if __name__ == '__main__': + # convert txt anno list to json + args = parse_args() + lines = open(args.annofile).readlines() + lines = [x.strip() for x in lines] + result = lines2dictlist(lines, args.format) + if args.output is None: + args.output = args.annofile.replace('.txt', '.json') + mmengine.dump(result, args.output) diff --git a/tools/data/ava/AVA_annotation_explained.md b/tools/data/ava/AVA_annotation_explained.md new file mode 100644 index 0000000000000000000000000000000000000000..dceab5bf55c666d9fa5d21e561e08ef80a6edc10 --- /dev/null +++ b/tools/data/ava/AVA_annotation_explained.md @@ -0,0 +1,34 @@ +# AVA Annotation Explained + +In this section, we explain the annotation format of AVA in details: + +``` +mmaction2 +├── data +│ ├── ava +│ │ ├── annotations +│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl +│ │ | ├── ava_train_v2.1.csv +│ │ | ├── ava_val_v2.1.csv +│ │ | ├── ava_train_excluded_timestamps_v2.1.csv +│ │ | ├── ava_val_excluded_timestamps_v2.1.csv +│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt +``` + +## The proposals generated by human detectors + +In the annotation folder, `ava_dense_proposals_[train/val/test].FAIR.recall_93.9.pkl` are human proposals generated by a human detector. They are used in training, validation and testing respectively. Take `ava_dense_proposals_train.FAIR.recall_93.9.pkl` as an example. It is a dictionary of size 203626. The key consists of the `videoID` and the `timestamp`. 
For example, the key `-5KQ66BBWC4,0902` means the values are the detection results for the frame at the 902-nd second in the video `-5KQ66BBWC4`. The values in the dictionary are numpy arrays with shape $$N \\times 5$$ , $$N$$ is the number of detected human bounding boxes in the corresponding frame. The format of bounding box is $$\[x_1, y_1, x_2, y_2, score\], 0 \\le x_1, y_1, x_2, y_2, score \\le 1$$. $$(x_1, y_1)$$ indicates the top-left corner of the bounding box, $$(x_2, y_2)$$ indicates the bottom-right corner of the bounding box; $$(0, 0)$$ indicates the top-left corner of the image, while $$(1, 1)$$ indicates the bottom-right corner of the image. + +## The ground-truth labels for spatio-temporal action detection + +In the annotation folder, `ava_[train/val]_v[2.1/2.2].csv` are ground-truth labels for spatio-temporal action detection, which are used during training & validation. Take `ava_train_v2.1.csv` as an example, it is a csv file with 837318 lines, each line is the annotation for a human instance in one frame. For example, the first line in `ava_train_v2.1.csv` is `'-5KQ66BBWC4,0902,0.077,0.151,0.283,0.811,80,1'`: the first two items `-5KQ66BBWC4` and `0902` indicate that it corresponds to the 902-nd second in the video `-5KQ66BBWC4`. The next four items ($$\[0.077(x_1), 0.151(y_1), 0.283(x_2), 0.811(y_2)\]$$) indicates the location of the bounding box, the bbox format is the same as human proposals. The next item `80` is the action label. The last item `1` is the ID of this bounding box. + +## Excluded timestamps + +`ava_[train/val]_excluded_timestamps_v[2.1/2.2].csv` contains excluded timestamps which are not used during training or validation. The format is `video_id, second_idx` . + +## Label map + +`ava_action_list_v[2.1/2.2]_for_activitynet_[2018/2019].pbtxt` contains the label map of the AVA dataset, which maps the action name to the label index.
diff --git a/tools/data/ava/README.md b/tools/data/ava/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b8667620bbe4bbeada903c14be88e2c70b09ab57 --- /dev/null +++ b/tools/data/ava/README.md @@ -0,0 +1,148 @@ +# Preparing AVA + +## Introduction + + + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={6047--6056}, + year={2018} +} +``` + +For basic dataset information, please refer to the official [website](https://research.google.com/ava/index.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ava/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +This command will download `ava_v2.1.zip` for AVA `v2.1` annotation. If you need the AVA `v2.2` annotation, you can try the following script. + +```shell +VERSION=2.2 bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, use the following script to prepare videos. The codes are adapted from the [official crawler](https://github.com/cvdfoundation/ava-dataset). +Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +Or you can use the following command to downloading AVA videos in parallel using a python script. + +```shell +bash download_videos_parallel.sh +``` + +Note that if you happen to have sudoer or have [GNU parallel](https://www.gnu.org/software/parallel/) on your machine, +you can speed up the procedure by downloading in parallel. 
+ +```shell +# sudo apt-get install parallel +bash download_videos_gnu_parallel.sh +``` + +## Step 3. Cut Videos + +Cut each video from its 15th to 30th minute and make them at 30 fps. + +```shell +bash cut_videos.sh +``` + +## Step 4. Extract RGB and Flow + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/ava_extracted/ +ln -s /mnt/SSD/ava_extracted/ ../data/ava/rawframes/ +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using ffmpeg by the following script. + +```shell +bash extract_rgb_frames_ffmpeg.sh +``` + +If both are required, run the following script to extract frames. + +```shell +bash extract_frames.sh +``` + +## Step 5. Fetch Proposal Files + +The scripts are adapted from FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). + +Run the following scripts to fetch the pre-computed proposal list. + +```shell +bash fetch_ava_proposals.sh +``` + +## Step 6. Folder Structure + +After the whole data pipeline for AVA preparation. +you can get the rawframes (RGB + Flow), videos and annotation files for AVA. + +In the context of the whole project (for AVA only), the *minimal* folder structure will look like: +(*minimal* means that some data are not necessary: for example, you may want to evaluate AVA using the original video format.) 
+ +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ava +│ │ ├── annotations +│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl +│ │ | ├── ava_train_v2.1.csv +│ │ | ├── ava_val_v2.1.csv +│ │ | ├── ava_train_excluded_timestamps_v2.1.csv +│ │ | ├── ava_val_excluded_timestamps_v2.1.csv +│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt +│ │ ├── videos +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── videos_15min +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── rawframes +│ │ │ ├── 053oq2xB3oU +| │ │ │ ├── img_00001.jpg +| │ │ │ ├── img_00002.jpg +| │ │ │ ├── ... +``` + +For training and evaluating on AVA, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). + +## Reference + +1. O. Tange (2018): GNU Parallel 2018, March 2018, https://doi.org/10.5281/zenodo.1146014 diff --git a/tools/data/ava/README_zh-CN.md b/tools/data/ava/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ad6d69478372c8f85c186ef636d6b13e42988754 --- /dev/null +++ b/tools/data/ava/README_zh-CN.md @@ -0,0 +1,134 @@ +# 准备 AVA + +## 简介 + + + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={6047--6056}, + year={2018} +} +``` + +请参照 [官方网站](https://research.google.com/ava/index.html) 以获取数据集基本信息。 +在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/ava/`。 + +## 1. 
准备标注文件 + +首先,用户可以使用如下脚本下载标注文件并进行预处理: + +```shell +bash download_annotations.sh +``` + +这一命令将下载 `ava_v2.1.zip` 以得到 AVA v2.1 标注文件。如用户需要 AVA v2.2 标注文件,可使用以下脚本: + +```shell +VERSION=2.2 bash download_annotations.sh +``` + +## 2. 下载视频 + +用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/cvdfoundation/ava-dataset)。 +注意这一步骤将花费较长时间。 + +```shell +bash download_videos.sh +``` + +亦可使用以下脚本,使用 python 并行下载 AVA 数据集视频: + +```shell +bash download_videos_parallel.sh +``` + +## 3. 截取视频 + +截取每个视频中的 15 到 30 分钟,设定帧率为 30。 + +```shell +bash cut_videos.sh +``` + +## 4. 提取 RGB 帧和光流 + +在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有足够的 SSD 空间,那么建议将视频抽取为 RGB 帧以提升 I/O 性能。用户可以使用以下脚本为抽取得到的帧文件夹建立软连接: + +```shell +# 执行以下脚本 (假设 SSD 被挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/ava_extracted/ +ln -s /mnt/SSD/ava_extracted/ ../data/ava/rawframes/ +``` + +如果用户只使用 RGB 帧(由于光流提取非常耗时),可执行以下脚本使用 denseflow 提取 RGB 帧: + +```shell +bash extract_rgb_frames.sh +``` + +如果用户未安装 denseflow,可执行以下脚本使用 ffmpeg 提取 RGB 帧: + +```shell +bash extract_rgb_frames_ffmpeg.sh +``` + +如果同时需要 RGB 帧和光流,可使用如下脚本抽帧: + +```shell +bash extract_frames.sh +``` + +## 5. 下载 AVA 上人体检测结果 + +以下脚本修改自 [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks)。 + +可使用以下脚本下载 AVA 上预先计算的人体检测结果: + +```shell +bash fetch_ava_proposals.sh +``` + +## 6. 
目录结构 + +在完整完成 AVA 的数据处理后,将得到帧文件夹(RGB 帧和光流帧),视频以及标注文件。 + +在整个项目目录下(仅针对 AVA),*最简* 目录结构如下所示: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ava +│ │ ├── annotations +│ │ | ├── ava_dense_proposals_train.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_val.FAIR.recall_93.9.pkl +│ │ | ├── ava_dense_proposals_test.FAIR.recall_93.9.pkl +│ │ | ├── ava_train_v2.1.csv +│ │ | ├── ava_val_v2.1.csv +│ │ | ├── ava_train_excluded_timestamps_v2.1.csv +│ │ | ├── ava_val_excluded_timestamps_v2.1.csv +│ │ | ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt +│ │ ├── videos +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── videos_15min +│ │ │ ├── 053oq2xB3oU.mkv +│ │ │ ├── 0f39OWEqJ24.mp4 +│ │ │ ├── ... +│ │ ├── rawframes +│ │ │ ├── 053oq2xB3oU +| │ │ │ ├── img_00001.jpg +| │ │ │ ├── img_00002.jpg +| │ │ │ ├── ... +``` + +关于 AVA 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/ava/cut_videos.sh b/tools/data/ava/cut_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..6912216543c1c9bc3cb1fe691aa9561ed03fa050 --- /dev/null +++ b/tools/data/ava/cut_videos.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Cut each video from its 15th to 30th minute. 
+ +IN_DATA_DIR="../../../data/ava/videos" +OUT_DATA_DIR="../../../data/ava/videos_15min" + +if [[ ! -d "${OUT_DATA_DIR}" ]]; then + echo "${OUT_DATA_DIR} doesn't exist. Creating it."; + mkdir -p ${OUT_DATA_DIR} +fi + +for video in $(ls -A1 -U ${IN_DATA_DIR}/*) +do + out_name="${OUT_DATA_DIR}/${video##*/}" + if [ ! -f "${out_name}" ]; then + ffmpeg -ss 900 -t 901 -i "${video}" -r 30 -strict experimental "${out_name}" + fi +done diff --git a/tools/data/ava/download_annotations.sh b/tools/data/ava/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..3839d6016819fcc689c13f27b849d5b4cf9c8fea --- /dev/null +++ b/tools/data/ava/download_annotations.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +VERSION=${VERSION:-"2.1"} +DATA_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://research.google.com/ava/download/ava_v${VERSION}.zip +unzip -j ava_v${VERSION}.zip -d ${DATA_DIR}/ +rm ava_v${VERSION}.zip diff --git a/tools/data/ava/download_videos.sh b/tools/data/ava/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..a26d19be793d1f394ddaef6bd17f398cbe63c432 --- /dev/null +++ b/tools/data/ava/download_videos.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/videos" +ANNO_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR} + +cat ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt | +while read vid; + do wget -c "https://s3.amazonaws.com/ava-dataset/trainval/${vid}" -P ${DATA_DIR}; done + +echo "Downloading finished." 
diff --git a/tools/data/ava/download_videos_gnu_parallel.sh b/tools/data/ava/download_videos_gnu_parallel.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e4d37d19336123dfcd80bc4c308fad6ad0cdd78 --- /dev/null +++ b/tools/data/ava/download_videos_gnu_parallel.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/videos" +ANNO_DIR="../../../data/ava/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR} + +# sudo apt-get install parallel +# parallel downloading to speed up +awk '{print "https://s3.amazonaws.com/ava-dataset/trainval/"$0}' ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt | +parallel -j 8 wget -c -q {} -P ${DATA_DIR} + +echo "Downloading finished." diff --git a/tools/data/ava/download_videos_parallel.py b/tools/data/ava/download_videos_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..9ed622f7b202b79ff78e03b3b340177cdd13294f --- /dev/null +++ b/tools/data/ava/download_videos_parallel.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os.path as osp +import subprocess + +import mmengine +from joblib import Parallel, delayed + +URL_PREFIX = 'https://s3.amazonaws.com/ava-dataset/trainval/' + + +def download_video(video_url, output_dir, num_attempts=5): + video_file = osp.basename(video_url) + output_file = osp.join(output_dir, video_file) + + status = False + + if not osp.exists(output_file): + command = ['wget', '-c', video_url, '-P', output_dir] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Downloading Failed' + else: + break + + status = osp.exists(output_file) + return status, 'Downloaded' + + +def main(source_file, output_dir, num_jobs=24, num_attempts=5): + mmengine.mkdir_or_exist(output_dir) + video_list = open(source_file).read().strip().split('\n') + video_list = [osp.join(URL_PREFIX, video) for video in video_list] + + if num_jobs == 1: + status_list = [] + for video in video_list: + video_list.append(download_video(video, output_dir, num_attempts)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_video)(video, output_dir, num_attempts) + for video in video_list) + + mmengine.dump(status_list, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading AVA videos' + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + 'source_file', type=str, help='TXT file containing the video filename') + parser.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved') + parser.add_argument('-n', '--num-jobs', type=int, default=24) + parser.add_argument('--num-attempts', type=int, default=5) + main(**vars(parser.parse_args())) diff --git a/tools/data/ava/download_videos_parallel.sh b/tools/data/ava/download_videos_parallel.sh new file 
mode 100644
index 0000000000000000000000000000000000000000..56810a72a2c73f8b4226f95b41ae9fdda4898cc5
--- /dev/null
+++ b/tools/data/ava/download_videos_parallel.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+set -e
+
+DATA_DIR="../../../data/ava/videos"
+ANNO_DIR="../../../data/ava/annotations"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} does not exist. Creating";
+  mkdir -p ${DATA_DIR}
+fi
+
+wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt -P ${ANNO_DIR}
+
+python download_videos_parallel.py ${ANNO_DIR}/ava_file_names_trainval_v2.1.txt ${DATA_DIR}
diff --git a/tools/data/ava/extract_frames.sh b/tools/data/ava/extract_frames.sh
new file mode 100644
index 0000000000000000000000000000000000000000..31be7ff066e36b006370eb4a84b7e9b822ef835c
--- /dev/null
+++ b/tools/data/ava/extract_frames.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+cd ../
+python build_rawframes.py ../../data/ava/videos_15min/ ../../data/ava/rawframes/ --task both --level 1 --flow-type tvl1 --mixed-ext
+echo "Raw frames (RGB and Flow) Generated"
+cd ava/
diff --git a/tools/data/ava/extract_rgb_frames.sh b/tools/data/ava/extract_rgb_frames.sh
new file mode 100644
index 0000000000000000000000000000000000000000..afcd8fd7651369e7ca6d98f6c7c750d2f095947d
--- /dev/null
+++ b/tools/data/ava/extract_rgb_frames.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+cd ../
+python build_rawframes.py ../../data/ava/videos_15min/ ../../data/ava/rawframes/ --task rgb --level 1 --mixed-ext
+echo "Generate raw frames (RGB only)"
+
+cd ava/
diff --git a/tools/data/ava/extract_rgb_frames_ffmpeg.sh b/tools/data/ava/extract_rgb_frames_ffmpeg.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1a3af335e83b50a93646c49835a9f415d886574e
--- /dev/null
+++ b/tools/data/ava/extract_rgb_frames_ffmpeg.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Extract frames from videos. + +IN_DATA_DIR="../../../data/ava/videos_15min" +OUT_DATA_DIR="../../../data/ava/rawframes" + +if [[ ! -d "${OUT_DATA_DIR}" ]]; then + echo "${OUT_DATA_DIR} doesn't exist. Creating it."; + mkdir -p ${OUT_DATA_DIR} +fi + +for video in $(ls -A1 -U ${IN_DATA_DIR}/*) +do + video_name=${video##*/} + + if [[ $video_name = *".webm" ]]; then + video_name=${video_name::-5} + else + video_name=${video_name::-4} + fi + + out_video_dir=${OUT_DATA_DIR}/${video_name} + mkdir -p "${out_video_dir}" + + out_name="${out_video_dir}/img_%05d.jpg" + + ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}" +done diff --git a/tools/data/ava/fetch_ava_proposals.sh b/tools/data/ava/fetch_ava_proposals.sh new file mode 100644 index 0000000000000000000000000000000000000000..18fdb67be4d216a2bcdf8e2f12ff40507b001921 --- /dev/null +++ b/tools/data/ava/fetch_ava_proposals.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/ava/annotations" + +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_train.FAIR.recall_93.9.pkl -P ${DATA_DIR} +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_val.FAIR.recall_93.9.pkl -P ${DATA_DIR} +wget https://download.openmmlab.com/mmaction/dataset/ava/ava_dense_proposals_test.FAIR.recall_93.9.pkl -P ${DATA_DIR} diff --git 
a/tools/data/ava/label_map.txt b/tools/data/ava/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0547b81309d9aa2538e88321b3fc5ecdeb9096 --- /dev/null +++ b/tools/data/ava/label_map.txt @@ -0,0 +1,60 @@ +1: bend/bow (at the waist) +3: crouch/kneel +4: dance +5: fall down +6: get up +7: jump/leap +8: lie/sleep +9: martial art +10: run/jog +11: sit +12: stand +13: swim +14: walk +15: answer phone +17: carry/hold (an object) +20: climb (e.g., a mountain) +22: close (e.g., a door, a box) +24: cut +26: dress/put on clothing +27: drink +28: drive (e.g., a car, a truck) +29: eat +30: enter +34: hit (an object) +36: lift/pick up +37: listen (e.g., to music) +38: open (e.g., a window, a car door) +41: play musical instrument +43: point to (an object) +45: pull (an object) +46: push (an object) +47: put down +48: read +49: ride (e.g., a bike, a car, a horse) +51: sail boat +52: shoot +54: smoke +56: take a photo +57: text on/look at a cellphone +58: throw +59: touch (an object) +60: turn (e.g., a screwdriver) +61: watch (e.g., TV) +62: work on a computer +63: write +64: fight/hit (a person) +65: give/serve (an object) to (a person) +66: grab (a person) +67: hand clap +68: hand shake +69: hand wave +70: hug (a person) +72: kiss (a person) +73: lift (a person) +74: listen to (a person) +76: push (another person) +77: sing to (e.g., self, a person, a group) +78: take (an object) from (a person) +79: talk to (e.g., self, a person, a group) +80: watch (a person) diff --git a/tools/data/ava_kinetics/README.md b/tools/data/ava_kinetics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e636b5dd39b99cbdefe865c15baaac9efe1dba05 --- /dev/null +++ b/tools/data/ava_kinetics/README.md @@ -0,0 +1,173 @@ +# Preparing AVA-Kinetics + +## Introduction + + + +```BibTeX +@article{li2020ava, + title={The ava-kinetics localized human actions video dataset}, + author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, 
Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, + journal={arXiv preprint arXiv:2005.00214}, + year={2020} +} +``` + +For basic dataset information, please refer to the official [website](https://research.google.com/ava/index.html). +AVA-Kinetics dataset is a crossover between the AVA Actions and Kinetics datasets. You may want to first prepare the AVA datasets. In this file, we provide commands to prepare the Kinetics part and merge the two parts together. + +For model training, we will keep reading from raw frames for the AVA part, but read from videos using `decord` for the Kinetics part to accelerate training. + +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ava_kinetics/`. + +## Step 1. Prepare the Kinetics700 dataset + +The Kinetics part of the AVA-Kinetics dataset are sampled from the Kinetics-700 dataset. + +It is best if you have prepared the Kinetics-700 dataset (only videos required) following +[Preparing Kinetics](https://github.com/open-mmlab/mmaction2/tree/master/tools/data/kinetics). We will also have alternative method to prepare these videos if you do not have enough storage (coming soon). + +We will need the videos of this dataset (`$MMACTION2/data/kinetics700/videos_train`) and the videos file list (`$MMACTION2/data/kinetics700/kinetics700_train_list_videos.txt`), which is generated by [Step 4 in Preparing Kinetics](https://github.com/open-mmlab/mmaction2/tree/master/tools/data/kinetics#step-4-generate-file-list) + +The format of the file list should be: + +``` +Path_to_video_1 label_1\n +Path_to_video_2 label_2\n +... +Path_to_video_n label_n\n +``` + +The timestamp (start and end of the video) must be contained. For example: + +``` +class602/o3lCwWyyc_s_000012_000022.mp4 602\n +``` + +It means that this video clip is the 12th to 22nd seconds of the original video. It is okay if some videos are missing, and we will ignore them in the next steps. + +## Step 2. 
Download Annotations + +Download the annotation tar file (recall that the directory should be located at `$MMACTION2/tools/data/ava_kinetics/`). + +```shell +wget https://storage.googleapis.com/deepmind-media/Datasets/ava_kinetics_v1_0.tar.gz +tar xf ava_kinetics_v1_0.tar.gz && rm ava_kinetics_v1_0.tar.gz +``` + +You should have the `ava_kinetics_v1_0` folder at `$MMACTION2/tools/data/ava_kinetics/`. + +## Step 3. Cut Videos + +Use `cut_kinetics.py` to find the desired videos from the Kinetics-700 dataset and trim them to contain only annotated clips. Currently we only use the train set of the Kinetics part to improve training. Validation on the Kinetics part will come soon. + +Here is the script: + +```shell +python3 cut_kinetics.py --avakinetics_anotation=$AVAKINETICS_ANOTATION \ + --kinetics_list=$KINETICS_LIST \ + --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS ] +``` + +Arguments: + +- `avakinetics_anotation`: the directory to ava-kinetics anotations. Defaults to `./ava_kinetics_v1_0`. +- `kinetics_list`: the path to the videos file list as mentioned in Step 1. If you have prepared the Kinetics700 dataset following `mmaction2`, it should be `$MMACTION2/data/kinetics700/kinetics700_train_list_videos.txt`. +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to cut videos. Defaults to -1 and use all available cpus. + +There should be about 100k videos. It is OK if some videos are missing and we will ignore them in the next steps. + +## Step 4. Extract RGB Frames + +This step is similar to Step 4 in [Preparing AVA](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/ava#step-4-extract-rgb-and-flow). 
+ +Here we provide a script to extract RGB frames using ffmpeg: + +```shell +python3 extract_rgb_frames.py --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS ] +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to extract frames. Defaults to -1 and use all available cpus. + +If you have installed denseflow, you can also use `build_rawframes.py` to extract RGB frames: + +```shell +python3 ../build_rawframes.py ../../../data/ava_kinetics/videos/ ../../../data/ava_kinetics/rawframes/ --task rgb --level 1 --mixed-ext +``` + +## Step 5. Prepare Annotations + +Use `prepare_annotation.py` to prepare the training annotations. It will generate a `kinetics_train.csv` file containning the spatial-temporal annotations for the Kinetics part, localting at `$AVAKINETICS_ROOT`. + +Here is the script: + +```shell +python3 prepare_annotation.py --avakinetics_anotation=$AVAKINETICS_ANOTATION \ + --avakinetics_root=$AVAKINETICS_ROOT \ + [--num_workers=$NUM_WORKERS] +``` + +Arguments: + +- `avakinetics_anotation`: the directory to ava-kinetics anotations. Defaults to `./ava_kinetics_v1_0`. +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `num_workers`: number of workers used to prepare annotations. Defaults to -1 and use all available cpus. + +## Step 6. Fetch Proposal Files + +The pre-computed proposals for AVA dataset are provided by FAIR's [Long-Term Feature Banks](https://github.com/facebookresearch/video-long-term-feature-banks). For the Kinetics part, we use `Cascade R-CNN X-101-64x4d-FPN` from [mmdetection](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) to fetch the proposals. 
Here is the script: + +```shell +python3 fetch_proposal.py --avakinetics_root=$AVAKINETICS_ROOT \ + --datalist=$DATALIST \ + --picklepath=$PICKLEPATH \ + [--config=$CONFIG ] \ + [--checkpoint=$CHECKPOINT ] + +``` + +It will generate a `kinetics_proposal.pkl` file at `$MMACTION2/data/ava_kinetics/`. + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `datalist`: path to the `kinetics_train.csv` file generated at Step 3. +- `picklepath`: path to save the extracted proposal pickle file. +- `config`: the config file for the human detection model. Defaults to `X-101-64x4d-FPN.py`. +- `checkpoint`: the checkpoint for the human detection model. Defaults to the `mmdetection` pretraining checkpoint. + +## Step 7. Merge AVA to AVA-Kinetics + +Now we are done with the preparations for the Kinetics part. We need to merge the AVA part into the `ava_kinetics` folder (assuming you have AVA dataset ready at `$MMACTION2/data/ava`). First we make a copy of the AVA anotation to the `ava_kinetics` folder (recall that you are at `$MMACTION2/tools/data/ava_kinetics/`): + +```shell +cp -r ../../../data/ava/annotations/ ../../../data/ava_kinetics/ +``` + +Next we merge the generated anotation files of the Kinetics part to AVA. Please check: you should have two files `kinetics_train.csv` and `kinetics_proposal.pkl` at `$MMACTION2/data/ava_kinetics/` generated from Step 5 and Step 6. Run the following script to merge these two files into `$MMACTION2/data/ava_kinetics/annotations/ava_train_v2.2.csv` and `$MMACTION2/data/ava_kinetics/annotations/ava_dense_proposals_train.FAIR.recall_93.9.pkl` respectively. + +```shell +python3 merge_annotations.py --avakinetics_root=$AVAKINETICS_ROOT +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. + +Finally, we need to merge the rawframes of AVA part. 
You can either copy/move them or generate soft links. The following script is an example to use soft links: + +```shell +python3 softlink_ava.py --avakinetics_root=$AVAKINETICS_ROOT \ + --ava_root=$AVA_ROOT +``` + +Arguments: + +- `avakinetics_root`: the directory to save the ava-kinetics dataset. Defaults to `$MMACTION2/data/ava_kinetics`. +- `ava_root`: the directory to save the ava dataset. Defaults to `$MMACTION2/data/ava`. diff --git a/tools/data/ava_kinetics/X-101-64x4d-FPN.py b/tools/data/ava_kinetics/X-101-64x4d-FPN.py new file mode 100644 index 0000000000000000000000000000000000000000..6a27ac7aa90e58e24a916432e692d3e371527b02 --- /dev/null +++ b/tools/data/ava_kinetics/X-101-64x4d-FPN.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'), + groups=64, + base_width=4), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + 
type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + 
dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ])) + +test_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox', + format_only=False) + +test_cfg = dict(type='TestLoop') diff --git a/tools/data/ava_kinetics/cut_kinetics.py b/tools/data/ava_kinetics/cut_kinetics.py new file mode 100644 index 0000000000000000000000000000000000000000..f06459bf75827f5afa0f3a26e48d9e815a8e1854 --- /dev/null +++ b/tools/data/ava_kinetics/cut_kinetics.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import multiprocessing +import os +from collections import defaultdict +from typing import List + +import decord + + +def get_kinetics_frames(kinetics_anotation_file: str) -> dict: + """Given the AVA-kinetics anotation file, return a lookup to map the video + id and the the set of timestamps involved of this video id. + + Args: + kinetics_anotation_file (str): Path to the AVA-like anotation file for + the kinetics subset. + Returns: + dict: the dict keys are the kinetics videos' video id. The values are + the set of timestamps involved. + """ + with open(kinetics_anotation_file) as f: + anotated_frames = [i.split(',') for i in f.readlines()] + anotated_frames = [i for i in anotated_frames if len(i) == 7] + anotated_frames = [(i[0], int(float(i[1]))) for i in anotated_frames] + + frame_lookup = defaultdict(set) + for video_id, timestamp in anotated_frames: + frame_lookup[video_id].add(timestamp) + return frame_lookup + + +def filter_missing_videos(kinetics_list: str, frame_lookup: dict) -> dict: + """Given the kinetics700 dataset list, remove the video ids from the lookup + that are missing videos or frames. 
+ + Args: + kinetics_list (str): Path to the kinetics700 dataset list. + The content of the list should be: + ``` + Path_to_video1 label_1\n + Path_to_video2 label_2\n + ... + Path_to_videon label_n\n + ``` + The start and end of the video must be contained in the filename. + For example: + ``` + class602/o3lCwWyyc_s_000012_000022.mp4\n + ``` + frame_lookup (dict): the dict from `get_kinetics_frames`. + Returns: + dict: the dict keys are the kinetics videos' video id. The values are + the a list of tuples: + (start_of_the_video, end_of_the_video, video_path) + """ + video_lookup = defaultdict(set) + with open(kinetics_list) as f: + for line in f.readlines(): + video_path = line.split(' ')[0] # remove label information + video_name = video_path.split('/')[-1] # get the file name + video_name = video_name.split('.')[0] # remove file extensions + video_name = video_name.split('_') + video_id = '_'.join(video_name[:-2]) + if video_id not in frame_lookup: + continue + + start, end = int(video_name[-2]), int(video_name[-1]) + frames = frame_lookup[video_id] + frames = [frame for frame in frames if start < frame < end] + if len(frames) == 0: + continue + + start, end = max(start, min(frames) - 2), min(end, max(frames) + 2) + video_lookup[video_id].add((start, end, video_path)) + + # Some frame ids exist in multiple videos in the Kinetics dataset. + # The reason is the part of one video may fall into different categories. 
+ # Remove the duplicated records + for video in video_lookup: + if len(video_lookup[video]) == 1: + continue + info_list = list(video_lookup[video]) + removed_list = [] + for i, info_i in enumerate(info_list): + start_i, end_i, _ = info_i + for j in range(i + 1, len(info_list)): + start_j, end_j, _ = info_list[j] + if start_i <= start_j and end_j <= end_i: + removed_list.append(j) + elif start_j <= start_i and end_i <= end_j: + removed_list.append(i) + new_list = [] + for i, info in enumerate(info_list): + if i not in removed_list: + new_list.append(info) + video_lookup[video] = set(new_list) + return video_lookup + + +template = ('ffmpeg -ss %d -t %d -accurate_seek -i' + ' %s -r 30 -avoid_negative_ts 1 %s') + + +def generate_cut_cmds(video_lookup: dict, data_root: str) -> List[str]: + cmds = [] + for video_id in video_lookup: + for start, end, video_path in video_lookup[video_id]: + start0 = int(video_path.split('_')[-2]) + new_path = '%s/%s_%06d_%06d.mp4' % (data_root, video_id, start, + end) + cmd = template % (start - start0, end - start, video_path, + new_path) + cmds.append(cmd) + return cmds + + +def run_cmd(cmd): + os.system(cmd) + return + + +def remove_failed_video(video_path: str) -> None: + """Given the path to the video, delete the video if it cannot be read or if + the actual length of the video is 0.75 seconds shorter than expected.""" + try: + v = decord.VideoReader(video_path) + fps = v.get_avg_fps() + num_frames = len(v) + x = video_path.split('.')[0].split('_') + time = int(x[-1]) - int(x[-2]) + if num_frames < (time - 3 / 4) * fps: + os.remove(video_path) + except: # noqa: E722 + os.remove(video_path) + return + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_anotation', + type=str, + default='./ava_kinetics_v1_0', + help='the directory to ava-kinetics anotations') + p.add_argument( + '--kinetics_list', + type=str, + help='the datalist of the kinetics700 training videos') + p.add_argument( + 
'--num_workers', + type=int, + default=-1, + help='number of workers used for multiprocessing') + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + args = p.parse_args() + + if args.num_workers > 0: + num_workers = args.num_workers + else: + num_workers = max(multiprocessing.cpu_count() - 1, 1) + + # Find videos from the Kinetics700 dataset required for AVA-Kinetics + kinetics_train = args.avakinetics_anotation + '/kinetics_train_v1.0.csv' + frame_lookup = get_kinetics_frames(kinetics_train) + video_lookup = filter_missing_videos(args.kinetics_list, frame_lookup) + + root = args.avakinetics_root + os.makedirs(root, exist_ok=True) + video_path = root + '/videos/' + os.makedirs(video_path, exist_ok=True) + all_cmds = generate_cut_cmds(video_lookup, video_path) + + # Cut and save the videos for AVA-Kinetics + pool = multiprocessing.Pool(num_workers) + _ = pool.map(run_cmd, all_cmds) + + # Remove failed videos + videos = os.listdir(video_path) + videos = ['%s/%s' % (video_path, video) for video in videos] + _ = pool.map(remove_failed_video, videos) diff --git a/tools/data/ava_kinetics/extract_rgb_frames.py b/tools/data/ava_kinetics/extract_rgb_frames.py new file mode 100644 index 0000000000000000000000000000000000000000..fa2d83b472f5773b8d5517b009b733703e50fe28 --- /dev/null +++ b/tools/data/ava_kinetics/extract_rgb_frames.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def extract_rgb(video_name, frame_path, video_path):
    """Compose the ffmpeg command that dumps RGB frames for one video.

    As a side effect, creates the per-video output directory
    ``frame_path/<video_id>``.

    Args:
        video_name (str): File name of the video (with extension).
        frame_path (str): Root directory for extracted raw frames.
        video_path (str): Directory containing the source videos.

    Returns:
        str: The ffmpeg command to run.
    """
    clip_id = video_name.split('.')[0]
    os.makedirs('%s/%s' % (frame_path, clip_id), exist_ok=True)
    command = ('ffmpeg -i %s/%s -r 30 -q:v 1 %s/%s' %
               (video_path, video_name, frame_path, clip_id))
    return command + '/img_%05d.jpg'
def get_vid_from_path(path):
    """Recover the Kinetics video id from a clip path.

    Clip folders are named ``<video_id>_<start>_<end>``; drop the last two
    underscore-separated fields.
    """
    stem = path.split('/')[-1]
    return '_'.join(stem.split('_')[:-2])


def prepare_det_lookup(datalist, frame_root):
    """Map ``'<video_id>,<time>'`` keys to their raw-frame folder.

    For every annotated timestamp, also register the two neighbouring
    seconds (``time - 1`` and ``time + 1``) so detection covers a small
    window around the annotation.

    Args:
        datalist (str): CSV list whose first two columns are the clip folder
            and the annotated timestamp (integer seconds).
        frame_root (str): Root directory holding the raw-frame folders.

    Returns:
        dict: proposal id (``'<video_id>,%04d'``) -> frame folder path.
    """
    with open(datalist) as f:
        lines = f.readlines()
    lookup = {}
    for line in lines:
        fields = line.split(',')
        folder = fields[0]
        vid = get_vid_from_path(folder)
        center = int(fields[1])
        for t in (center - 1, center, center + 1):
            lookup['%s,%04d' % (vid, t)] = '%s/%s' % (frame_root, folder)
    return lookup
scores[idx].item() + result_.append((h1, w1, h2, w2, score)) + lookup[key] = np.array(result_) + except: # noqa: E722 + pass + + with open('tmp_person_%d.pkl' % rank, 'wb') as f: + pickle.dump(lookup, f) + return + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') + p.add_argument( + '--datalist', + type=str, + default='../../../data/ava_kinetics/kinetics_train.csv', + help='the list for kinetics videos') + p.add_argument( + '--config', + type=str, + default='X-101-64x4d-FPN.py', + help='the human detector') + p.add_argument( + '--checkpoint', + type=str, + default='https://download.openmmlab.com/mmdetection/v2.0/' + 'cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/' + 'cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_' + '075702-43ce6a30.pth', + help='the human detector checkpoint') + p.add_argument( + '--picklepath', + type=str, + default='../../../data/ava_kinetics/kinetics_proposal.pkl') + p.add_argument('--num_gpus', type=int, default=8) + + args = p.parse_args() + + frame_root = args.avakinetics_root + '/rawframes/' + det_lookup = prepare_det_lookup(args.datalist, frame_root) + + processes = [] + for rank in range(args.num_gpus): + ctx = mp.get_context('spawn') + p = ctx.Process(target=single_worker, args=(rank, det_lookup, args)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + lookup = {} + for k in range(args.num_gpus): + one_lookup = pickle.load(open('tmp_person_%d.pkl' % k, 'rb')) + os.remove('tmp_person_%d.pkl' % k) + for key in one_lookup: + lookup[key] = one_lookup[key] + + with open(args.picklepath, 'wb') as f: + pickle.dump(lookup, f) diff --git a/tools/data/ava_kinetics/merge_annotations.py b/tools/data/ava_kinetics/merge_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..51771ea3aaf6665dfbc53c36b6f6956f2a178cee --- /dev/null +++ 
def check_file(path):
    """Raise ``FileNotFoundError`` unless ``path`` is an existing file."""
    if os.path.isfile(path):
        return
    parts = path.split('/')
    folder = '/'.join(parts[:-1])
    filename = parts[-1]
    raise FileNotFoundError('%s not found at %s' % (filename, folder))
def filter_train_list(kinetics_anotation_file, lookup):
    """Keep only AVA-Kinetics annotations whose clip exists locally.

    Args:
        kinetics_anotation_file (str): Path to the
            ``kinetics_train_v1.0.csv`` annotation file.
        lookup (dict): video id -> list of ``(start, end, folder_name)``
            tuples describing the clips available on disk.

    Returns:
        list[str]: AVA-style annotation lines, each ending with a ``-1``
        (unknown) person id.
    """
    with open(kinetics_anotation_file) as f:
        rows = [line.split(',') for line in f.readlines()]
    # Valid annotated rows have exactly 7 comma-separated fields.
    rows = [row for row in rows if len(row) == 7]

    kept = []
    for row in rows:
        clips = lookup.get(row[0])
        if clips is None:
            continue
        # First local clip whose time span covers the annotated timestamp.
        folder = next((name for begin, finish, name in clips
                       if begin < float(row[1]) < finish), None)
        if folder is None:
            continue

        frame_idx, x1, y1, x2, y2, label = map(float, row[1:7])
        frame_idx, label = int(frame_idx), int(label)
        kept.append(f'{folder},{frame_idx},'
                    f'{x1:.3f},{y1:.3f},{x2:.3f},{y2:.3f},{label},-1\n')
    return kept
args.num_workers + else: + num_workers = max(multiprocessing.cpu_count() - 1, 1) + + frame_root = args.avakinetics_root + '/rawframes/' + frame_root = os.path.abspath(frame_root) + lookup = get_avaialble_clips(frame_root, num_workers) + + kinetics_train = args.avakinetics_anotation + '/kinetics_train_v1.0.csv' + filtered_list = filter_train_list(kinetics_train, lookup) + + with open('%s/kinetics_train.csv' % args.avakinetics_root, 'w') as f: + for line in filtered_list: + f.write(line) diff --git a/tools/data/ava_kinetics/softlink_ava.py b/tools/data/ava_kinetics/softlink_ava.py new file mode 100644 index 0000000000000000000000000000000000000000..a377c5f672e9d765130122e455333665a1c9ff8f --- /dev/null +++ b/tools/data/ava_kinetics/softlink_ava.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os + +p = argparse.ArgumentParser() +p.add_argument( + '--ava_root', + type=str, + default='../../../data/ava', + help='the path to save ava dataset') +p.add_argument( + '--avakinetics_root', + type=str, + default='../../../data/ava_kinetics', + help='the path to save ava-kinetics dataset') +args = p.parse_args() + +ava_frames = os.path.abspath(args.ava_root + '/rawframes/') +kinetics_frames = os.path.abspath(args.avakinetics_root + '/rawframes/') + +ava_folders = os.listdir(ava_frames) +for folder in ava_folders: + cmd = 'ln -s %s/%s %s/%s' % (ava_frames, folder, kinetics_frames, folder) + os.system(cmd) diff --git a/tools/data/build_audio_features.py b/tools/data/build_audio_features.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1a3b4dcd8b486cbcbc232b300a0d806f7c3788 --- /dev/null +++ b/tools/data/build_audio_features.py @@ -0,0 +1,320 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import os
import os.path as osp

import numpy as np
from scipy.io import wavfile

try:
    import librosa
    import lws
except ImportError:
    print('Please import librosa, lws first.')

# Feature-extraction hyper-parameters shared by all AudioTools instances.
SILENCE_THRESHOLD = 2
FMIN = 125
FMAX = 7600
FRAME_SHIFT_MS = None
MIN_LEVEL_DB = -100
REF_LEVEL_DB = 20
RESCALING = True
RESCALING_MAX = 0.999
ALLOW_CLIPPING_IN_NORMALIZATION = True
LOG_SCALE_MIN = -32.23619130191664
NORM_AUDIO = True


class AudioTools:
    """All methods related to audio feature extraction.

    Code Reference:
    `deepvoice3_pytorch <https://github.com/r9y9/deepvoice3_pytorch>`_,
    `lws <https://pypi.org/project/lws/>`_.

    Args:
        frame_rate (int): The frame rate per second of the video.
            Defaults to 30.
        sample_rate (int): The sample rate for audio sampling.
            Defaults to 16000.
        num_mels (int): Number of channels of the melspectrogram.
            Defaults to 80.
        fft_size (int): fft_size / sample_rate is window size.
            Defaults to 1280.
        hop_size (int): hop_size / sample_rate is step size.
            Defaults to 320.
        spectrogram_type (str): The spectrogram backend, ``'lws'`` or
            ``'librosa'``. Defaults to ``'lws'``.
    """

    def __init__(self,
                 frame_rate=30,
                 sample_rate=16000,
                 num_mels=80,
                 fft_size=1280,
                 hop_size=320,
                 spectrogram_type='lws'):
        self.frame_rate = frame_rate
        self.sample_rate = sample_rate
        self.silence_threshold = SILENCE_THRESHOLD
        self.num_mels = num_mels
        self.fmin = FMIN
        self.fmax = FMAX
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.frame_shift_ms = FRAME_SHIFT_MS
        self.min_level_db = MIN_LEVEL_DB
        self.ref_level_db = REF_LEVEL_DB
        self.rescaling = RESCALING
        self.rescaling_max = RESCALING_MAX
        self.allow_clipping_in_normalization = ALLOW_CLIPPING_IN_NORMALIZATION
        self.log_scale_min = LOG_SCALE_MIN
        self.norm_audio = NORM_AUDIO
        self.spectrogram_type = spectrogram_type
        assert spectrogram_type in ['lws', 'librosa']
        # Lazily-built mel filter bank cache (see ``_linear_to_mel``).
        self._mel_basis = None

    def load_wav(self, path):
        """Load an audio file into numpy array."""
        return librosa.core.load(path, sr=self.sample_rate)[0]

    @staticmethod
    def audio_normalize(samples, desired_rms=0.1, eps=1e-4):
        """RMS normalize the audio data."""
        rms = np.maximum(eps, np.sqrt(np.mean(samples**2)))
        samples = samples * (desired_rms / rms)
        return samples

    def generate_spectrogram_magphase(self, audio, with_phase=False):
        """Separate a complex-valued spectrogram D into its magnitude (S)
        and phase (P) components, so that D = S * P.

        Args:
            audio (np.ndarray): The input audio signal.
            with_phase (bool): Determines whether to output the
                phase components. Default: False.

        Returns:
            np.ndarray: magnitude and phase component of the complex-valued
            spectrogram.
        """
        spectro = librosa.core.stft(
            audio,
            hop_length=self.get_hop_size(),
            n_fft=self.fft_size,
            center=True)
        spectro_mag, spectro_phase = librosa.core.magphase(spectro)
        spectro_mag = np.expand_dims(spectro_mag, axis=0)
        if with_phase:
            spectro_phase = np.expand_dims(np.angle(spectro_phase), axis=0)
            return spectro_mag, spectro_phase

        return spectro_mag

    def save_wav(self, wav, path):
        """Save the wav to disk."""
        # 32767 = (2 ^ 15 - 1) maximum of int16
        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
        wavfile.write(path, self.sample_rate, wav.astype(np.int16))

    def trim(self, quantized):
        """Trim leading/trailing silence from the quantized waveform."""
        start, end = self.start_and_end_indices(quantized,
                                                self.silence_threshold)
        return quantized[start:end]

    def adjust_time_resolution(self, quantized, mel):
        """Adjust time resolution by repeating features.

        Args:
            quantized (np.ndarray): (T,)
            mel (np.ndarray): (N, D)

        Returns:
            tuple: Tuple of (T,) and (T, D)
        """
        assert quantized.ndim == 1
        assert mel.ndim == 2

        upsample_factor = quantized.size // mel.shape[0]
        mel = np.repeat(mel, upsample_factor, axis=0)
        n_pad = quantized.size - mel.shape[0]
        if n_pad != 0:
            assert n_pad > 0
            mel = np.pad(
                mel, [(0, n_pad), (0, 0)], mode='constant', constant_values=0)

        # trim silence so waveform and features stay aligned
        start, end = self.start_and_end_indices(quantized,
                                                self.silence_threshold)

        return quantized[start:end], mel[start:end, :]

    @staticmethod
    def start_and_end_indices(quantized, silence_threshold=2):
        """Return the first and last indices that exceed the silence
        threshold (values are 8-bit mu-law centred on 127)."""
        for start in range(quantized.size):
            if abs(quantized[start] - 127) > silence_threshold:
                break
        for end in range(quantized.size - 1, 1, -1):
            if abs(quantized[end] - 127) > silence_threshold:
                break

        assert abs(quantized[start] - 127) > silence_threshold
        assert abs(quantized[end] - 127) > silence_threshold

        return start, end

    def melspectrogram(self, y):
        """Generate the normalized log-mel spectrogram of waveform ``y``."""
        D = self._lws_processor().stft(y).T
        S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
        if not self.allow_clipping_in_normalization:
            assert S.max() <= 0 and S.min() - self.min_level_db >= 0
        return self._normalize(S)

    def get_hop_size(self):
        """Calculate the hop size in samples."""
        hop_size = self.hop_size
        if hop_size is None:
            assert self.frame_shift_ms is not None
            hop_size = int(self.frame_shift_ms / 1000 * self.sample_rate)
        return hop_size

    def _lws_processor(self):
        """Build an lws (local weighted sums) STFT processor.

        Please refer to `lws <https://pypi.org/project/lws/>`_.
        """
        return lws.lws(self.fft_size, self.get_hop_size(), mode='speech')

    @staticmethod
    def lws_num_frames(length, fsize, fshift):
        """Compute number of time frames of lws spectrogram."""
        pad = (fsize - fshift)
        if length % fshift == 0:
            M = (length + pad * 2 - fsize) // fshift + 1
        else:
            M = (length + pad * 2 - fsize) // fshift + 2
        return M

    def lws_pad_lr(self, x, fsize, fshift):
        """Compute left and right padding lws internally uses."""
        M = self.lws_num_frames(len(x), fsize, fshift)
        pad = (fsize - fshift)
        T = len(x) + 2 * pad
        r = (M - 1) * fshift + fsize - T
        return pad, pad + r

    def _linear_to_mel(self, spectrogram):
        """Warp a linear-scale spectrogram to the mel scale.

        The mel filter bank only depends on constructor arguments, so it is
        built once and cached on the instance. (The previous implementation
        assigned a module-level global on every call, so the intended cache
        never took effect and the bank was rebuilt each time.)
        """
        if self._mel_basis is None:
            self._mel_basis = self._build_mel_basis()
        return np.dot(self._mel_basis, spectrogram)

    def _build_mel_basis(self):
        """Build mel filters.

        NOTE(review): positional ``sr``/``n_fft`` arguments are keyword-only
        in librosa >= 0.10 — confirm the pinned librosa version.
        """
        assert self.fmax <= self.sample_rate // 2
        return librosa.filters.mel(
            self.sample_rate,
            self.fft_size,
            fmin=self.fmin,
            fmax=self.fmax,
            n_mels=self.num_mels)

    def _amp_to_db(self, x):
        """Convert amplitude to decibels, floored at ``min_level_db``."""
        min_level = np.exp(self.min_level_db / 20 * np.log(10))
        return 20 * np.log10(np.maximum(min_level, x))

    @staticmethod
    def _db_to_amp(x):
        """Convert decibels back to amplitude."""
        return np.power(10.0, x * 0.05)

    def _normalize(self, S):
        """Scale a dB spectrogram into [0, 1]."""
        return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)

    def _denormalize(self, S):
        """Inverse of :meth:`_normalize`."""
        return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db

    def read_audio(self, audio_path):
        """Load and normalize an audio file."""
        wav = self.load_wav(audio_path)
        if self.norm_audio:
            wav = self.audio_normalize(wav)
        else:
            wav = wav / np.abs(wav).max()

        return wav

    def audio_to_spectrogram(self, wav):
        """Convert a waveform to a spectrogram with the selected backend."""
        if self.spectrogram_type == 'lws':
            spectrogram = self.melspectrogram(wav).astype(np.float32).T
        elif self.spectrogram_type == 'librosa':
            spectrogram = self.generate_spectrogram_magphase(wav)
        return spectrogram


def extract_audio_feature(wav_path, audio_tools, mel_out_dir):
    """Extract the spectrogram of one audio file and save it as ``.npy``.

    Skips files whose output already exists; failures are logged, not
    raised, so a Pool run keeps going.
    """
    file_name, _ = osp.splitext(osp.basename(wav_path))
    # Write the spectrograms to disk:
    mel_filename = os.path.join(mel_out_dir, file_name + '.npy')
    if not os.path.exists(mel_filename):
        try:
            wav = audio_tools.read_audio(wav_path)

            spectrogram = audio_tools.audio_to_spectrogram(wav)

            np.save(
                mel_filename,
                spectrogram.astype(np.float32),
                allow_pickle=False)

        except Exception:
            # Narrowed from BaseException so Ctrl-C still interrupts.
            print(f'Read audio [{wav_path}] failed.')
+ parser.add_argument('--part', type=str, default='1/1') + args = parser.parse_args() + + mmengine.mkdir_or_exist(args.spectrogram_save_path) + + files = glob.glob(args.audio_home_path + '/*' * args.level + '.' + + args.ext) + print(f'found {len(files)} files.') + files = sorted(files) + if args.part is not None: + [this_part, num_parts] = [int(i) for i in args.part.split('/')] + part_len = len(files) // num_parts + + p = Pool(args.num_workers) + for file in files[part_len * (this_part - 1):( + part_len * this_part) if this_part != num_parts else len(files)]: + p.apply_async( + extract_audio_feature, + args=(file, audio_tools, args.spectrogram_save_path)) + p.close() + p.join() diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..e00fd382c9891c2b33ca17c6bee26b3e8874de4d --- /dev/null +++ b/tools/data/build_file_list.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import json +import os.path as osp +import random + +from mmengine.runner import set_random_seed + +from tools.data.anno_txt2json import lines2dictlist +from tools.data.parse_file_list import (parse_directory, parse_diving48_splits, + parse_hmdb51_split, + parse_jester_splits, + parse_kinetics_splits, + parse_mit_splits, parse_mmit_splits, + parse_sthv1_splits, parse_sthv2_splits, + parse_ucf101_splits) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Build file list') + parser.add_argument( + 'dataset', + type=str, + choices=[ + 'ucf101', 'kinetics400', 'kinetics600', 'kinetics700', 'thumos14', + 'sthv1', 'sthv2', 'mit', 'mmit', 'activitynet', 'hmdb51', 'jester', + 'diving48' + ], + help='dataset to be built file list') + parser.add_argument( + 'src_folder', type=str, help='root directory for the frames or videos') + parser.add_argument( + '--rgb-prefix', type=str, default='img_', help='prefix of rgb frames') + parser.add_argument( + 
def build_file_list(splits, frame_info, shuffle=False):
    """Build file list for a certain data split.

    Args:
        splits (tuple): Data split to generate file list.
        frame_info (dict): Dict mapping from frames to path. e.g.,
            'Skiing/v_Skiing_g18_c02': ('data/ucf101/rawframes/Skiing/v_Skiing_g18_c02', 0, 0). # noqa: E501
        shuffle (bool): Whether to shuffle the file list.

    Returns:
        tuple: RGB file list for training and testing, together with
        Flow file list for training and testing.
    """

    def build_list(split):
        """Build RGB and Flow file list with a given split.

        Args:
            split (list): Split to be generate file list.

        Returns:
            tuple[list, list]: (rgb_list, flow_list), rgb_list is the
                generated file list for rgb, flow_list is the generated
                file list for flow.
        """
        rgb_list, flow_list = list(), list()
        for item in split:
            if item[0] not in frame_info:
                continue
            if frame_info[item[0]][1] > 0:
                # rawframes
                rgb_cnt = frame_info[item[0]][1]
                flow_cnt = frame_info[item[0]][2]
                if isinstance(item[1], int):
                    rgb_list.append(f'{item[0]} {rgb_cnt} {item[1]}\n')
                    flow_list.append(f'{item[0]} {flow_cnt} {item[1]}\n')
                elif isinstance(item[1], list):
                    # only for multi-label datasets like mmit
                    label_str = ' '.join(str(digit) for digit in item[1])
                    rgb_list.append(f'{item[0]} {rgb_cnt} ' + label_str +
                                    '\n')
                    # BUG FIX: the flow entry used to be appended to
                    # rgb_list, leaving flow_list empty (and rgb_list
                    # doubled) for multi-label rawframe datasets.
                    flow_list.append(f'{item[0]} {flow_cnt} ' + label_str +
                                     '\n')
                else:
                    raise ValueError(
                        'frame_info should be ' +
                        '[`video`(str), `label`(int)|`labels(list[int])`')
            else:
                # videos
                if isinstance(item[1], int):
                    rgb_list.append(f'{frame_info[item[0]][0]} {item[1]}\n')
                    flow_list.append(f'{frame_info[item[0]][0]} {item[1]}\n')
                elif isinstance(item[1], list):
                    # only for multi-label datasets like mmit
                    label_str = ' '.join(str(digit) for digit in item[1])
                    rgb_list.append(f'{frame_info[item[0]][0]} ' + label_str +
                                    '\n')
                    flow_list.append(f'{frame_info[item[0]][0]} ' +
                                     label_str + '\n')
                else:
                    raise ValueError(
                        'frame_info should be ' +
                        '[`video`(str), `label`(int)|`labels(list[int])`')
        if shuffle:
            # NOTE(review): rgb and flow are shuffled independently, so the
            # two lists end up in different orders — confirm this is intended.
            random.shuffle(rgb_list)
            random.shuffle(flow_list)
        return rgb_list, flow_list

    train_rgb_list, train_flow_list = build_list(splits[0])
    test_rgb_list, test_flow_list = build_list(splits[1])
    return (train_rgb_list, test_rgb_list), (train_flow_list, test_flow_list)
args.level == 1: + # search for one-level directory + video_list = glob.glob(osp.join(args.src_folder, '*')) + elif args.level == 2: + # search for two-level directory + video_list = glob.glob(osp.join(args.src_folder, '*', '*')) + else: + raise ValueError(f'level must be 1 or 2, but got {args.level}') + frame_info = {} + for video in video_list: + video_path = osp.relpath(video, args.src_folder) + # video_id: (video_relative_path, -1, -1) + frame_info[osp.splitext(video_path)[0]] = (video_path, -1, -1) + else: + raise NotImplementedError('only rawframes and videos are supported') + + if args.dataset == 'ucf101': + splits = parse_ucf101_splits(args.level) + elif args.dataset == 'sthv1': + splits = parse_sthv1_splits(args.level) + elif args.dataset == 'sthv2': + splits = parse_sthv2_splits(args.level) + elif args.dataset == 'mit': + splits = parse_mit_splits() + elif args.dataset == 'mmit': + splits = parse_mmit_splits() + elif args.dataset in ['kinetics400', 'kinetics600', 'kinetics700']: + splits = parse_kinetics_splits(args.level, args.dataset) + elif args.dataset == 'hmdb51': + splits = parse_hmdb51_split(args.level) + elif args.dataset == 'jester': + splits = parse_jester_splits(args.level) + elif args.dataset == 'diving48': + splits = parse_diving48_splits() + else: + raise ValueError( + f"Supported datasets are 'ucf101, sthv1, sthv2', 'jester', " + f"'mmit', 'mit', 'kinetics400', 'kinetics600', 'kinetics700', but " + f'got {args.dataset}') + + assert len(splits) == args.num_split + + out_path = args.out_root_path + args.dataset + + if len(splits) > 1: + for i, split in enumerate(splits): + file_lists = build_file_list( + split, frame_info, shuffle=args.shuffle) + train_name = f'{args.dataset}_train_split_{i+1}_{args.format}.txt' + val_name = f'{args.dataset}_val_split_{i+1}_{args.format}.txt' + if args.output_format == 'txt': + with open(osp.join(out_path, train_name), 'w') as f: + f.writelines(file_lists[0][0]) + with open(osp.join(out_path, val_name), 'w') 
def extract_frame(vid_item):
    """Generate optical flow using dense flow.

    Worker for ``multiprocessing.Pool``: extracts RGB frames, optical flow,
    or both for a single video, via OpenCV (mmcv) or the external
    ``denseflow`` binary. Relies on the module-level globals ``args``
    (parsed CLI options) and ``lock`` (installed by ``init`` in each worker).

    Args:
        vid_item (list): Video item containing video full path,
            video (short) path, video id.

    Returns:
        bool: Whether generate optical flow successfully.
            NOTE(review): always returns True, even on failure — callers
            cannot distinguish outcomes from the return value; success is
            only recorded in ``report_file``.
    """
    full_path, vid_path, vid_id, method, task, report_file = vid_item
    # Two-level layouts put each video under a class folder; mirror that
    # class folder in the output directory.
    if '/' in vid_path:
        act_name = osp.basename(osp.dirname(vid_path))
        out_full_path = osp.join(args.out_dir, act_name)
    else:
        out_full_path = args.out_dir

    # -1 = not (yet) successful; 0 = success (matches os.system's exit code).
    run_success = -1

    if task == 'rgb':
        if args.use_opencv:
            # Not like using denseflow,
            # Use OpenCV will not make a sub directory with the video name
            try:
                video_name = osp.splitext(osp.basename(vid_path))[0]
                out_full_path = osp.join(out_full_path, video_name)

                vr = mmcv.VideoReader(full_path)
                for i, vr_frame in enumerate(vr):
                    if vr_frame is not None:
                        # NOTE(review): np.shape(frame) is (height, width,
                        # channels), so `w` here is actually the height and
                        # `h` the width — the min(h, w) resize logic below
                        # should be double-checked.
                        w, h, _ = np.shape(vr_frame)
                        if args.new_short == 0:
                            if args.new_width == 0 or args.new_height == 0:
                                # Keep original shape
                                out_img = vr_frame
                            else:
                                out_img = mmcv.imresize(
                                    vr_frame,
                                    (args.new_width, args.new_height))
                        else:
                            # Rescale so the short side equals new_short,
                            # keeping the aspect ratio.
                            if min(h, w) == h:
                                new_h = args.new_short
                                new_w = int((new_h / h) * w)
                            else:
                                new_w = args.new_short
                                new_h = int((new_w / w) * h)
                            out_img = mmcv.imresize(vr_frame, (new_h, new_w))
                        mmcv.imwrite(out_img,
                                     f'{out_full_path}/img_{i + 1:05d}.jpg')
                    else:
                        # Decoder returned a null frame before the reported
                        # length was reached; stop early.
                        warnings.warn(
                            'Length inconsistent!'
                            f'Early stop with {i + 1} out of {len(vr)} frames.'
                        )
                        break
                run_success = 0
            except Exception:
                run_success = -1
        else:
            # NOTE(review): osp.join with a single argument returns that
            # string unchanged — these calls are no-ops, presumably left
            # over from an earlier refactor.
            if args.new_short == 0:
                cmd = osp.join(
                    f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'"
                    f' -nw={args.new_width} -nh={args.new_height} -v')
            else:
                cmd = osp.join(
                    f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'"
                    f' -ns={args.new_short} -v')
            run_success = os.system(cmd)
    elif task == 'flow':
        # NOTE(review): the flow branches pass `--nh` while the rgb branches
        # use `-nh` — confirm which spelling denseflow accepts.
        if args.input_frames:
            # `--if` tells denseflow the input is rgb frames, not a video.
            if args.new_short == 0:
                cmd = osp.join(
                    f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                    f' -nw={args.new_width} --nh={args.new_height} -v --if')
            else:
                cmd = osp.join(
                    f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                    f' -ns={args.new_short} -v --if')
        else:
            if args.new_short == 0:
                cmd = osp.join(
                    f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                    f' -nw={args.new_width} --nh={args.new_height} -v')
            else:
                cmd = osp.join(
                    f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                    f' -ns={args.new_short} -v')
        run_success = os.system(cmd)
    else:
        # task == 'both': run denseflow twice, once for rgb (-s=0) and once
        # for flow (-s=1); success only if both exit with 0.
        if args.new_short == 0:
            cmd_rgb = osp.join(
                f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'"
                f' -nw={args.new_width} -nh={args.new_height} -v')
            cmd_flow = osp.join(
                f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                f' -nw={args.new_width} -nh={args.new_height} -v')
        else:
            cmd_rgb = osp.join(
                f"denseflow '{full_path}' -b=20 -s=0 -o='{out_full_path}'"
                f' -ns={args.new_short} -v')
            cmd_flow = osp.join(
                f"denseflow '{full_path}' -a={method} -b=20 -s=1 -o='{out_full_path}'"  # noqa: E501
                f' -ns={args.new_short} -v')
        run_success_rgb = os.system(cmd_rgb)
        run_success_flow = os.system(cmd_flow)
        if run_success_flow == 0 and run_success_rgb == 0:
            run_success = 0

    if run_success == 0:
        print(f'{task} {vid_id} {vid_path} {method} done')
        sys.stdout.flush()

        # Serialize appends to the shared report file across pool workers.
        lock.acquire()
        with open(report_file, 'a') as f:
            line = full_path + '\n'
            f.write(line)
        lock.release()
    else:
        print(f'{task} {vid_id} {vid_path} {method} got something wrong')
        sys.stdout.flush()

    return True
parser.add_argument( + '--input-frames', + action='store_true', + help='Whether to extract flow frames based on rgb frames') + parser.add_argument( + '--report-file', + type=str, + default='build_report.txt', + help='report to record files which have been successfully processed') + args = parser.parse_args() + + return args + + +def init(lock_): + global lock + lock = lock_ + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + if args.level == 2: + classes = os.listdir(args.src_dir) + for classname in classes: + new_dir = osp.join(args.out_dir, classname) + if not osp.isdir(new_dir): + print(f'Creating folder: {new_dir}') + os.makedirs(new_dir) + + if args.input_frames: + print('Reading rgb frames from folder: ', args.src_dir) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + print('Total number of rgb frame folders found: ', len(fullpath_list)) + else: + print('Reading videos from folder: ', args.src_dir) + if args.mixed_ext: + print('Extension of videos is mixed') + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + else: + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' 
+ + args.ext) + print('Total number of videos found: ', len(fullpath_list)) + + if args.resume: + done_fullpath_list = [] + with open(args.report_file) as f: + for line in f: + if line == '\n': + continue + done_full_path = line.strip().split()[0] + done_fullpath_list.append(done_full_path) + done_fullpath_list = set(done_fullpath_list) + fullpath_list = list(set(fullpath_list).difference(done_fullpath_list)) + + if args.level == 2: + vid_list = list( + map( + lambda p: osp.join( + osp.basename(osp.dirname(p)), osp.basename(p)), + fullpath_list)) + elif args.level == 1: + vid_list = list(map(osp.basename, fullpath_list)) + + lock = Lock() + pool = Pool(args.num_worker, initializer=init, initargs=(lock, )) + pool.map( + extract_frame, + zip(fullpath_list, vid_list, range(len(vid_list)), + len(vid_list) * [args.flow_type], + len(vid_list) * [args.task], + len(vid_list) * [args.report_file])) + pool.close() + pool.join() diff --git a/tools/data/build_videos.py b/tools/data/build_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..6540157dda9fd4d4a27c0981985ffd8728289807 --- /dev/null +++ b/tools/data/build_videos.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +from multiprocessing import Pool + + +def encode_video(frame_dir_item): + """Encode frames to video using ffmpeg. + + Args: + frame_dir_item (list): Rawframe item containing raw frame directory + full path, rawframe directory (short) path, rawframe directory id. + + Returns: + bool: Whether synthesize video successfully. + """ + full_path, frame_dir_path, frame_dir_id = frame_dir_item + out_full_path = args.out_dir + + img_name_tmpl = args.filename_tmpl + '.' + args.in_format + img_path = osp.join(full_path, img_name_tmpl) + + out_vid_name = frame_dir_path + '.' 
+ args.ext + out_vid_path = osp.join(out_full_path, out_vid_name) + + cmd = osp.join( + f"ffmpeg -start_number {args.start_idx} -r {args.fps} -i '{img_path}' " + f"-vcodec {args.vcodec} '{out_vid_path}'") + os.system(cmd) + + print(f'{frame_dir_id} {frame_dir_path} done') + sys.stdout.flush() + return True + + +def parse_args(): + parser = argparse.ArgumentParser(description='synthesize videos') + parser.add_argument('src_dir', type=str, help='source rawframe directory') + parser.add_argument('out_dir', type=str, help='output video directory') + parser.add_argument( + '--fps', type=int, default=30, help='fps of videos to be synthesized') + parser.add_argument( + '--level', + type=int, + choices=[1, 2], + default=2, + help='directory level of data') + parser.add_argument( + '--num-worker', + type=int, + default=8, + help='number of workers to build videos') + parser.add_argument( + '--in-format', + type=str, + default='jpg', + choices=['jpg', 'png'], + help='input format') + parser.add_argument( + '--start-idx', type=int, default=0, help='starting index of rawframes') + parser.add_argument( + '--filename-tmpl', + type=str, + default='img_%05d', + help='filename template of rawframes') + parser.add_argument( + '--vcodec', type=str, default='mpeg4', help='coding method of videos') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['mp4', 'avi'], + help='video file extensions') + parser.add_argument('--num-gpu', type=int, default=8, help='number of GPU') + parser.add_argument( + '--resume', + action='store_true', + default=False, + help='resume optical flow extraction instead of overwriting') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + if args.level == 2: + classes = os.listdir(args.src_dir) + for classname in classes: + new_dir = osp.join(args.out_dir, classname) + if not 
osp.isdir(new_dir): + print(f'Creating folder: {new_dir}') + os.makedirs(new_dir) + + print('Reading rgb frames from folder: ', args.src_dir) + print('Input format of rgb frames: ', args.in_format) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level) + done_fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' + + args.ext) + print('Total number of rgb frame folders found: ', len(fullpath_list)) + + if args.resume: + fullpath_list = set(fullpath_list).difference(set(done_fullpath_list)) + fullpath_list = list(fullpath_list) + print('Resuming. number of videos to be synthesized: ', + len(fullpath_list)) + + if args.level == 2: + frame_dir_list = list( + map( + lambda p: osp.join( + osp.basename(osp.dirname(p)), osp.basename(p)), + fullpath_list)) + elif args.level == 1: + frame_dir_list = list(map(osp.basename, fullpath_list)) + + pool = Pool(args.num_worker) + pool.map(encode_video, + zip(fullpath_list, frame_dir_list, range(len(frame_dir_list)))) diff --git a/tools/data/charades-sta/README.md b/tools/data/charades-sta/README.md new file mode 100644 index 0000000000000000000000000000000000000000..788dd51472f9ed663e7964ec1ad96878f597d1e1 --- /dev/null +++ b/tools/data/charades-sta/README.md @@ -0,0 +1,59 @@ +# Preparing Charades-STA + +## Introduction + + + +```BibTeX +@inproceedings{gao2017tall, + title={Tall: Temporal activity localization via language query}, + author={Gao, Jiyang and Sun, Chen and Yang, Zhenheng and Nevatia, Ram}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={5267--5275}, + year={2017} +} + +@inproceedings{DRN2020CVPR, + author = {Runhao, Zeng and Haoming, Xu and Wenbing, Huang and Peihao, Chen and Mingkui, Tan and Chuang Gan}, + title = {Dense Regression Network for Video Grounding}, + booktitle = {CVPR}, + year = {2020}, +} +``` + +Charades-STA is a new dataset built on top of Charades by adding sentence temporal annotations. It is introduced by Gao et al.
in `TALL: Temporal Activity Localization via Language Query`. Currently, we only support C3D features from `Dense Regression Network for Video Grounding`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations from the official repository of DRN: + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare C3D features + +After the first step, you should be at `${MMACTION2}/data/CharadesSTA/`. Download the C3D features following the [official command](https://github.com/Alvin-Zeng/DRN/tree/master#download-features) to the current directory `${MMACTION2}/data/CharadesSTA/`. + +After finishing the two steps, the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── CharadesSTA +│ │ ├── C3D_unit16_overlap0.5_merged +│ │ | ├── 001YG.pt +│ │ | ├── 003WS.pt +│ │ | ├── 004QE.pt +│ │ | ├── 00607.pt +│ │ | ├── ... +│ │ ├── Charades_duration.json +│ │ ├── Charades_fps_dict.json +│ │ ├── Charades_frames_info.json +│ │ ├── Charades_sta_test.txt +│ │ ├── Charades_sta_train.txt +│ │ ├── Charades_word2id.json +``` diff --git a/tools/data/charades-sta/download_annotations.sh b/tools/data/charades-sta/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..9755f81a958f77cff4408cbb591d0369d0e0477b --- /dev/null +++ b/tools/data/charades-sta/download_annotations.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/CharadesSTA/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +URL="https://raw.githubusercontent.com/Alvin-Zeng/DRN/master/data/dataset/Charades" +wget ${URL}/Charades_frames_info.json +wget ${URL}/Charades_duration.json +wget ${URL}/Charades_fps_dict.json +wget ${URL}/Charades_sta_test.txt +wget ${URL}/Charades_sta_train.txt +wget ${URL}/Charades_word2id.json diff --git a/tools/data/denormalize_proposal_file.py b/tools/data/denormalize_proposal_file.py new file mode 100644 index 0000000000000000000000000000000000000000..7e07832765e92a8ed04b1328ad5e333e5257091d --- /dev/null +++ b/tools/data/denormalize_proposal_file.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmaction.localization import load_localize_proposal_file +from tools.data.parse_file_list import parse_directory + + +def process_norm_proposal_file(norm_proposal_file, frame_dict): + """Process the normalized proposal file and denormalize it. + + Args: + norm_proposal_file (str): Name of normalized proposal file. + frame_dict (dict): Information of frame folders. 
+ """ + proposal_file = norm_proposal_file.replace('normalized_', '') + norm_proposals = load_localize_proposal_file(norm_proposal_file) + + processed_proposal_list = [] + for idx, norm_proposal in enumerate(norm_proposals): + video_id = norm_proposal[0] + frame_info = frame_dict[video_id] + num_frames = frame_info[1] + frame_path = osp.basename(frame_info[0]) + + gt = [[ + int(x[0]), + int(float(x[1]) * num_frames), + int(float(x[2]) * num_frames) + ] for x in norm_proposal[2]] + + proposal = [[ + int(x[0]), + float(x[1]), + float(x[2]), + int(float(x[3]) * num_frames), + int(float(x[4]) * num_frames) + ] for x in norm_proposal[3]] + + gt_dump = '\n'.join(['{} {} {}'.format(*x) for x in gt]) + gt_dump += '\n' if len(gt) else '' + proposal_dump = '\n'.join( + ['{} {:.04f} {:.04f} {} {}'.format(*x) for x in proposal]) + proposal_dump += '\n' if len(proposal) else '' + + processed_proposal_list.append( + f'# {idx}\n{frame_path}\n{num_frames}\n1' + f'\n{len(gt)}\n{gt_dump}{len(proposal)}\n{proposal_dump}') + + with open(proposal_file, 'w') as f: + f.writelines(processed_proposal_list) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Denormalize proposal file') + parser.add_argument( + 'dataset', + type=str, + choices=['thumos14'], + help='dataset to be denormalize proposal file') + parser.add_argument( + '--norm-proposal-file', + type=str, + help='normalized proposal file to be denormalize') + parser.add_argument( + '--data-prefix', + type=str, + help='path to a directory where rawframes are held') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + print(f'Converting from {args.norm_proposal_file}.') + frame_dict = parse_directory(args.data_prefix) + process_norm_proposal_file(args.norm_proposal_file, frame_dict) + + +if __name__ == '__main__': + main() diff --git a/tools/data/diving48/README.md b/tools/data/diving48/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..a441be78eb072de81d956c8d663ddf85189d4c14 --- /dev/null +++ b/tools/data/diving48/README.md @@ -0,0 +1,143 @@ +# Preparing Diving48 + +## Introduction + + + +```BibTeX +@inproceedings{li2018resound, + title={Resound: Towards action recognition without representation bias}, + author={Li, Yingwei and Li, Yi and Vasconcelos, Nuno}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={513--528}, + year={2018} +} +``` + +For basic dataset information, you can refer to the official dataset [website](http://www.svcl.ucsd.edu/projects/resound/dataset.html). + +`````{tabs} + +````{group-tab} Download by MIM +MIM supports downloading from OpenDataLab and preprocessing Diving48 dataset with one command line. +```Bash +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login +# download and preprocess by MIM +mim download mmaction2 --dataset diving48 +``` + +```` + +````{group-tab} Download form Official Source + +## Step 1. Prepare Annotations + +You can run the following script to download annotations (considering the correctness of annotation files, we only download V2 version here). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/diving48/`. + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +You can run the following script to download videos. + +```shell +bash download_videos.sh +``` + +## Step 3. Prepare RGB and Flow + +This part is **optional** if you only want to use the video loader. + +The frames provided in official compressed file are not complete. You may need to go through the following extraction steps to get the complete frames. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). 
+ +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/diving48_extracted/ +ln -s /mnt/SSD/diving48_extracted/ ../../../data/diving48/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +```` +````` + +### Check Directory Structure + +After the whole data process for Diving48 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Diving48. + +In the context of the whole project (for Diving48 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── diving48 +│ │ ├── diving48_{train,val}_list_rawframes.txt +│ │ ├── diving48_{train,val}_list_videos.txt +│ │ ├── annotations (optinonal) +│ | | ├── Diving48_V2_train.json +│ | | ├── Diving48_V2_test.json +│ | | ├── Diving48_vocab.json +│ | ├── videos +│ | | ├── _8Vy3dlHg2w_00000.mp4 +│ | | ├── _8Vy3dlHg2w_00001.mp4 +│ | | ├── ... 
+│ | ├── rawframes (optional) +│ | | ├── 2x00lRzlTVQ_00000 +│ | | | ├── img_00001.jpg +│ | | | ├── img_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2x00lRzlTVQ_00001 +│ | | ├── ... +``` + +For training and evaluating on Diving48, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/diving48/README_zh-CN.md b/tools/data/diving48/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..9f3d85a757b8e5b507af638aecaed593726d6e14 --- /dev/null +++ b/tools/data/diving48/README_zh-CN.md @@ -0,0 +1,141 @@ +# 准备 Diving48 + +## 简介 + + + +```BibTeX +@inproceedings{li2018resound, + title={Resound: Towards action recognition without representation bias}, + author={Li, Yingwei and Li, Yi and Vasconcelos, Nuno}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + pages={513--528}, + year={2018} +} +``` + +用户可参考该数据集的 [官网](http://www.svcl.ucsd.edu/projects/resound/dataset.html),以获取数据集相关的基本信息。 + +`````{tabs} + +````{group-tab} 使用 MIM 下载 +# MIM 支持下载 Diving48 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 +```Bash +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login +# 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 +mim download mmaction2 --dataset diving48 +``` + +```` + +````{group-tab} 从官方源下载 +## 步骤 1. 下载标注文件 + +用户可以使用以下命令下载标注文件(考虑到标注的准确性,这里仅下载 V2 版本)。在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/diving48/`。 + +```shell +bash download_annotations.sh +``` + +## 步骤 2. 准备视频 + +用户可以使用以下命令下载视频。 + +```shell +bash download_videos.sh +``` + +## Step 3. 
抽取 RGB 帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +官网提供的帧压缩包并不完整。若想获取完整的数据,可以使用以下步骤解帧。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/diving48_extracted/ +ln -s /mnt/SSD/diving48_extracted/ ../../../data/diving48/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本进行抽取。 + +```shell +cd $MMACTION2/tools/data/diving48/ +bash extract_frames.sh +``` + +## 步骤 4. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +```` +````` + +### 检查文件夹结构 + +在完成所有 Diving48 数据集准备流程后, +用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Diving48 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── diving48 +│ │ ├── diving48_{train,val}_list_rawframes.txt +│ │ ├── diving48_{train,val}_list_videos.txt +│ │ ├── annotations(可选) +│ | | ├── Diving48_V2_train.json +│ | | ├── Diving48_V2_test.json +│ | | ├── Diving48_vocab.json +│ | ├── videos +│ | | ├── _8Vy3dlHg2w_00000.mp4 +│ | | ├── _8Vy3dlHg2w_00001.mp4 +│ | | ├── ... +│ | ├── rawframes(可选) +│ | | ├── 2x00lRzlTVQ_00000 +│ | | | ├── img_00001.jpg +│ | | | ├── img_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2x00lRzlTVQ_00001 +│ | | ├── ... 
+``` + +关于对 Diving48 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/diving48/download_annotations.sh b/tools/data/diving48/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..19b8bfb9d4ebde5b86ba0ae9da95a3568b88f97a --- /dev/null +++ b/tools/data/diving48/download_annotations.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/diving48/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_vocab.json +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_V2_train.json +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_V2_test.json + +cd - diff --git a/tools/data/diving48/download_videos.sh b/tools/data/diving48/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..210cbb7b945b74e53fe207338115d2c1d5f6b6c2 --- /dev/null +++ b/tools/data/diving48/download_videos.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/diving48/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget http://www.svcl.ucsd.edu/projects/resound/Diving48_rgb.tar.gz --no-check-certificate +tar -zxvf Diving48_rgb.tar.gz +mv ./rgb ./videos + +cd - diff --git a/tools/data/diving48/extract_frames.sh b/tools/data/diving48/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..704d53a13509200adac714e452655556451be342 --- /dev/null +++ b/tools/data/diving48/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task both --level 1 --flow-type tvl1 --ext mp4 +echo "Raw frames (RGB and tv-l1) Generated" +cd - diff --git a/tools/data/diving48/extract_rgb_frames.sh b/tools/data/diving48/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..13990a464a5297e3e0891867960242a0575fdee3 --- /dev/null +++ b/tools/data/diving48/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task rgb --level 1 --ext mp4 +echo "Genearte raw frames (RGB only)" + +cd - diff --git a/tools/data/diving48/extract_rgb_frames_opencv.sh b/tools/data/diving48/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..bec75ef2fd8ad32240f1a97f407ce002768ee277 --- /dev/null +++ b/tools/data/diving48/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/diving48/videos/ ../../data/diving48/rawframes/ --task rgb --level 1 --ext mp4 --use-opencv +echo "Genearte raw frames (RGB only)" + +cd - diff --git a/tools/data/diving48/generate_rawframes_filelist.sh b/tools/data/diving48/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..48043324b0118773264912d2a195aa6793d8b2da --- /dev/null +++ b/tools/data/diving48/generate_rawframes_filelist.sh @@ 
-0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/rawframes/ --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/rawframes/ --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/diving48/ diff --git a/tools/data/diving48/generate_videos_filelist.sh b/tools/data/diving48/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..fb170470acbd827fb3c9444ce9b80d69b0a24372 --- /dev/null +++ b/tools/data/diving48/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py diving48 data/diving48/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/diving48/ diff --git a/tools/data/diving48/label_map.txt b/tools/data/diving48/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b7569da5675bae6693d147d7701896bf774b4d9 --- /dev/null +++ b/tools/data/diving48/label_map.txt @@ -0,0 +1,48 @@ +Back+15som+05Twis+FREE +Back+15som+15Twis+FREE +Back+15som+25Twis+FREE +Back+15som+NoTwis+PIKE +Back+15som+NoTwis+TUCK +Back+25som+15Twis+PIKE +Back+25som+25Twis+PIKE +Back+25som+NoTwis+PIKE +Back+25som+NoTwis+TUCK +Back+2som+15Twis+FREE +Back+2som+25Twis+FREE +Back+35som+NoTwis+PIKE +Back+35som+NoTwis+TUCK +Back+3som+NoTwis+PIKE +Back+3som+NoTwis+TUCK +Back+Dive+NoTwis+PIKE +Back+Dive+NoTwis+TUCK +Forward+15som+1Twis+FREE +Forward+15som+2Twis+FREE +Forward+15som+NoTwis+PIKE +Forward+1som+NoTwis+PIKE +Forward+25som+1Twis+PIKE +Forward+25som+2Twis+PIKE +Forward+25som+3Twis+PIKE +Forward+25som+NoTwis+PIKE +Forward+25som+NoTwis+TUCK +Forward+35som+NoTwis+PIKE +Forward+35som+NoTwis+TUCK +Forward+45som+NoTwis+TUCK +Forward+Dive+NoTwis+PIKE +Forward+Dive+NoTwis+STR +Inward+15som+NoTwis+PIKE +Inward+15som+NoTwis+TUCK +Inward+25som+NoTwis+PIKE +Inward+25som+NoTwis+TUCK +Inward+35som+NoTwis+TUCK +Inward+Dive+NoTwis+PIKE +Reverse+15som+05Twis+FREE +Reverse+15som+15Twis+FREE +Reverse+15som+25Twis+FREE +Reverse+15som+35Twis+FREE +Reverse+15som+NoTwis+PIKE +Reverse+25som+15Twis+PIKE +Reverse+25som+NoTwis+PIKE +Reverse+25som+NoTwis+TUCK +Reverse+35som+NoTwis+TUCK +Reverse+Dive+NoTwis+PIKE +Reverse+Dive+NoTwis+TUCK diff --git a/tools/data/diving48/preprocess.sh b/tools/data/diving48/preprocess.sh new file mode 100644 index 0000000000000000000000000000000000000000..75b649b50a978ddf9a4f2e10bf54abe8bcb27f1d --- /dev/null +++ b/tools/data/diving48/preprocess.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenDataLab___diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) +tar -xvf $DATA_ROOT/diving48.tar -C $(dirname $DATA_ROOT) +rm 
$DATA_ROOT/diving48.tar diff --git a/tools/data/extract_audio.py b/tools/data/extract_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..8b754dbad38deb04cb2fcffaf15f959fd0ec92ce --- /dev/null +++ b/tools/data/extract_audio.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +from multiprocessing import Pool + +import mmengine + + +def extract_audio_wav(line): + """Extract the audio wave from video streams using FFMPEG.""" + video_id, _ = osp.splitext(osp.basename(line)) + video_dir = osp.dirname(line) + video_rel_dir = osp.relpath(video_dir, args.root) + dst_dir = osp.join(args.dst_root, video_rel_dir) + os.popen(f'mkdir -p {dst_dir}') + try: + if osp.exists(f'{dst_dir}/{video_id}.wav'): + return + cmd = f'ffmpeg -i ./{line} -map 0:a -y {dst_dir}/{video_id}.wav' + os.popen(cmd) + except BaseException: + with open('extract_wav_err_file.txt', 'a+') as f: + f.write(f'{line}\n') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract audios') + parser.add_argument('root', type=str, help='source video directory') + parser.add_argument('dst_root', type=str, help='output audio directory') + parser.add_argument( + '--level', type=int, default=2, help='directory level of data') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['avi', 'mp4', 'webm'], + help='video file extensions') + parser.add_argument( + '--num-worker', type=int, default=8, help='number of workers') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + mmengine.mkdir_or_exist(args.dst_root) + + print('Reading videos from folder: ', args.root) + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.root + '/*' * args.level + '.' 
+ args.ext) + done_fullpath_list = glob.glob(args.dst_root + '/*' * args.level + '.wav') + print('Total number of videos found: ', len(fullpath_list)) + print('Total number of videos extracted finished: ', + len(done_fullpath_list)) + + pool = Pool(args.num_worker) + pool.map(extract_audio_wav, fullpath_list) diff --git a/tools/data/gym/README.md b/tools/data/gym/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56e5e7693f6d6232e5dcd9c394b67122d0eb12c0 --- /dev/null +++ b/tools/data/gym/README.md @@ -0,0 +1,109 @@ +# Preparing GYM + +## Introduction + + + +```BibTeX +@inproceedings{shao2020finegym, + title={Finegym: A hierarchical video dataset for fine-grained action understanding}, + author={Shao, Dian and Zhao, Yue and Dai, Bo and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2616--2625}, + year={2020} +} +``` + +For basic dataset information, please refer to the official [project](https://sdolivia.github.io/FineGym/) and the [paper](https://arxiv.org/abs/2004.06704). +We currently provide the data pre-processing pipeline for GYM99. +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/gym/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. +The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +## Step 3. Trim Videos into Events + +First, you need to trim long videos into events based on the annotation of GYM with the following scripts. + +```shell +python trim_event.py +``` + +## Step 4. 
Trim Events into Subactions + +Then, you need to trim events into subactions based on the annotation of GYM with the following scripts. We use the two stage trimming for better efficiency (trimming multiple short clips from a long video can be extremely inefficient, since you need to go over the video many times). + +```shell +python trim_subaction.py +``` + +## Step 5. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader for RGB model training. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +Run the following script to extract both rgb and flow using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 6. Generate file list for GYM99 based on extracted subactions + +You can use the following script to generate train / val lists for GYM99. + +```shell +python generate_file_list.py +``` + +## Step 7. Folder Structure + +After the whole data pipeline for GYM preparation. You can get the subaction clips, event clips, raw videos and GYM99 train/val lists. + +In the context of the whole project (for GYM only), the full folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── gym +| | ├── annotations +| | | ├── gym99_train_org.txt +| | | ├── gym99_val_org.txt +| | | ├── gym99_train.txt +| | | ├── gym99_val.txt +| | | ├── annotation.json +| | | └── event_annotation.json +│ │ ├── videos +| | | ├── 0LtLS9wROrk.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw.mp4 +│ │ ├── events +| | | ├── 0LtLS9wROrk_E_002407_002435.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw_E_006732_006824.mp4 +│ │ ├── subactions +| | | ├── 0LtLS9wROrk_E_002407_002435_A_0003_0005.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw_E_006244_006252_A_0000_0007.mp4 +| | └── subaction_frames +``` + +For training and evaluating on GYM, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
diff --git a/tools/data/gym/README_zh-CN.md b/tools/data/gym/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..626b46fc11c55e06972b4a6f29600b7440072a68 --- /dev/null +++ b/tools/data/gym/README_zh-CN.md @@ -0,0 +1,109 @@ +# 准备 GYM + +## 简介 + + + +```BibTeX +@inproceedings{shao2020finegym, + title={Finegym: A hierarchical video dataset for fine-grained action understanding}, + author={Shao, Dian and Zhao, Yue and Dai, Bo and Lin, Dahua}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2616--2625}, + year={2020} +} +``` + +请参照 [项目主页](https://sdolivia.github.io/FineGym/) 及 [原论文](https://sdolivia.github.io/FineGym/) 以获取数据集基本信息。 +MMAction2 当前支持 GYM99 的数据集预处理。 +在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/gym/`。 + +## 1. 准备标注文件 + +首先,用户可以使用如下脚本下载标注文件并进行预处理: + +```shell +bash download_annotations.sh +``` + +## 2. 准备视频 + +用户可以使用以下脚本准备视频,视频准备代码修改自 [ActivityNet 爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。 +注意这一步骤将花费较长时间。 + +```shell +bash download_videos.sh +``` + +## 3. 裁剪长视频至动作级别 + +用户首先需要使用以下脚本将 GYM 中的长视频依据标注文件裁剪至动作级别。 + +```shell +python trim_event.py +``` + +## 4. 裁剪动作视频至分动作级别 + +随后,用户需要使用以下脚本将 GYM 中的动作视频依据标注文件裁剪至分动作级别。将视频的裁剪分成两个级别可以带来更高的效率(在长视频中裁剪多个极短片段异常耗时)。 + +```shell +python trim_subaction.py +``` + +## 5. 提取 RGB 帧和光流 + +如果用户仅使用 video loader,则可以跳过本步。 + +在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +用户可使用如下脚本同时抽取 RGB 帧和光流(提取光流时使用 tvl1 算法): + +```shell +bash extract_frames.sh +``` + +## 6. 基于提取出的分动作生成文件列表 + +用户可使用以下脚本为 GYM99 生成训练及测试的文件列表: + +```shell +python generate_file_list.py +``` + +## 7. 
目录结构 + +在完整完成 GYM 的数据处理后,将得到帧文件夹(RGB 帧和光流帧),动作视频片段,分动作视频片段以及训练测试所用标注文件。 + +在整个项目目录下(仅针对 GYM),完整目录结构如下所示: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── gym +| | ├── annotations +| | | ├── gym99_train_org.txt +| | | ├── gym99_val_org.txt +| | | ├── gym99_train.txt +| | | ├── gym99_val.txt +| | | ├── annotation.json +| | | └── event_annotation.json +│ │ ├── videos +| | | ├── 0LtLS9wROrk.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw.mp4 +│ │ ├── events +| | | ├── 0LtLS9wROrk_E_002407_002435.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw_E_006732_006824.mp4 +│ │ ├── subactions +| | | ├── 0LtLS9wROrk_E_002407_002435_A_0003_0005.mp4 +| | | ├── ... +| | | └── zfqS-wCJSsw_E_006244_006252_A_0000_0007.mp4 +| | └── subaction_frames +``` + +关于 GYM 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/gym/download.py b/tools/data/gym/download.py new file mode 100644 index 0000000000000000000000000000000000000000..9b89d38539c9b4668b1a086a1d844ea6dc8801ff --- /dev/null +++ b/tools/data/gym/download.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This scripts is copied from +# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The code is licensed under the MIT licence. +import argparse +import os +import ssl +import subprocess + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context + + +def download(video_identifier, + output_filename, + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + """ + # Defensive argument checking. 
+ assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + + if not os.path.exists(output_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate', + '-f', 'mp4', '-o', + '"%s"' % output_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Fail' + else: + break + # Check if the video was successfully saved. + status = os.path.exists(output_filename) + return status, 'Downloaded' + + +def download_wrapper(youtube_id, output_dir): + """Wrapper for parallel processing purposes.""" + # we do this to align with names in annotations + output_filename = os.path.join(output_dir, youtube_id + '.mp4') + if os.path.exists(output_filename): + status = tuple([youtube_id, True, 'Exists']) + return status + + downloaded, log = download(youtube_id, output_filename) + status = tuple([youtube_id, downloaded, log]) + return status + + +def main(input, output_dir, num_jobs=24): + # Reading and parsing ActivityNet. + youtube_ids = mmengine.load(input).keys() + # Creates folders where videos will be saved later. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Download all clips. + if num_jobs == 1: + status_list = [] + for index in youtube_ids: + status_list.append(download_wrapper(index, output_dir)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_wrapper)(index, output_dir) + for index in youtube_ids) + + # Save download report. 
+ mmengine.dump(status_list, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading GYM videos.' + p = argparse.ArgumentParser(description=description) + p.add_argument('input', type=str, help='The gym annotation file') + p.add_argument( + 'output_dir', type=str, help='Output directory to save videos.') + p.add_argument('-n', '--num-jobs', type=int, default=24) + main(**vars(p.parse_args())) diff --git a/tools/data/gym/download_annotations.sh b/tools/data/gym/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0a1bd728a2fddaa98797307b9da4d53509c5945 --- /dev/null +++ b/tools/data/gym/download_annotations.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/gym/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://sdolivia.github.io/FineGym/resources/dataset/finegym_annotation_info_v1.0.json -O $DATA_DIR/annotation.json +wget https://sdolivia.github.io/FineGym/resources/dataset/gym99_train_element_v1.0.txt -O $DATA_DIR/gym99_train_org.txt +wget https://sdolivia.github.io/FineGym/resources/dataset/gym99_val_element.txt -O $DATA_DIR/gym99_val_org.txt diff --git a/tools/data/gym/download_videos.sh b/tools/data/gym/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f6fbce6e83255756502b280fa7a34939839b469 --- /dev/null +++ b/tools/data/gym/download_videos.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate gym +pip install mmengine +pip install --upgrade youtube-dl + +DATA_DIR="../../../data/gym" +ANNO_DIR="../../../data/gym/annotations" +python download.py ${ANNO_DIR}/annotation.json ${DATA_DIR}/videos + +source deactivate gym +conda remove -n gym --all diff --git a/tools/data/gym/environment.yml b/tools/data/gym/environment.yml new file mode 100644 index 
0000000000000000000000000000000000000000..95469fd8be53f80450ac99a41eac5d79202f5d49 --- /dev/null +++ b/tools/data/gym/environment.yml @@ -0,0 +1,36 @@ +name: gym +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/gym/extract_frames.sh b/tools/data/gym/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..594bc898e804c1e7d31495f1a5c5695eeedf6341 --- /dev/null +++ b/tools/data/gym/extract_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/gym/subactions/ ../../data/gym/subaction_frames/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated" + +cd gym/ diff --git a/tools/data/gym/generate_file_list.py b/tools/data/gym/generate_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..17ea121a803dae88b1cfe399cc9b6b410033ecb3 --- /dev/null +++ b/tools/data/gym/generate_file_list.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp + +annotation_root = '../../../data/gym/annotations' +data_root = '../../../data/gym/subactions' +frame_data_root = '../../../data/gym/subaction_frames' + +videos = os.listdir(data_root) +videos = set(videos) + +train_file_org = osp.join(annotation_root, 'gym99_train_org.txt') +val_file_org = osp.join(annotation_root, 'gym99_val_org.txt') +train_file = osp.join(annotation_root, 'gym99_train.txt') +val_file = osp.join(annotation_root, 'gym99_val.txt') +train_frame_file = osp.join(annotation_root, 'gym99_train_frame.txt') +val_frame_file = osp.join(annotation_root, 'gym99_val_frame.txt') + +train_org = open(train_file_org).readlines() +train_org = [x.strip().split() for x in train_org] +train = [x for x in train_org if x[0] + '.mp4' in videos] +if osp.exists(frame_data_root): + train_frames = [] + for line in train: + length = len(os.listdir(osp.join(frame_data_root, line[0]))) + train_frames.append([line[0], str(length // 3), line[1]]) + train_frames = [' '.join(x) for x in train_frames] + with open(train_frame_file, 'w') as fout: + fout.write('\n'.join(train_frames)) + +train = [x[0] + '.mp4 ' + x[1] for x in train] +with open(train_file, 'w') as fout: + fout.write('\n'.join(train)) + +val_org = open(val_file_org).readlines() +val_org = [x.strip().split() for x in val_org] +val = [x for x in val_org if x[0] + '.mp4' in videos] +if osp.exists(frame_data_root): + val_frames = [] + for line in val: + length = len(os.listdir(osp.join(frame_data_root, line[0]))) + val_frames.append([line[0], str(length // 3), line[1]]) + val_frames = [' '.join(x) for x in val_frames] + with open(val_frame_file, 'w') as fout: + fout.write('\n'.join(val_frames)) + +val = [x[0] + '.mp4 ' + x[1] for x in val] +with open(val_file, 'w') as fout: + fout.write('\n'.join(val)) diff --git a/tools/data/gym/label_map.txt b/tools/data/gym/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..79f4010816b30047a74b1b71786c7d94770ebdbb --- 
/dev/null +++ b/tools/data/gym/label_map.txt @@ -0,0 +1,99 @@ +(VT) round-off, flic-flac with 0.5 turn on, stretched salto forward with 0.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2.5 turn off +(VT) round-off, flic-flac on, stretched salto backward off +(FX) switch leap with 0.5 turn +(FX) switch leap with 1 turn +(FX) split leap with 1 turn +(FX) split leap with 1.5 turn or more +(FX) switch leap (leap forward with leg change to cross split) +(FX) split jump with 1 turn +(FX) split jump (leg separation 180 degree parallel to the floor) +(FX) johnson with additional 0.5 turn +(FX) straddle pike or side split jump with 1 turn +(FX) switch leap to ring position +(FX) stag jump +(FX) 2 turn with free leg held upward in 180 split position throughout turn +(FX) 2 turn in tuck stand on one leg, free leg straight throughout turn +(FX) 3 turn on one leg, free leg optional below horizontal +(FX) 2 turn on one leg, free leg optional below horizontal +(FX) 1 turn on one leg, free leg optional below horizontal +(FX) 2 turn or more with heel of free leg forward at horizontal throughout turn +(FX) 1 turn with heel of free leg forward at horizontal throughout turn +(FX) arabian double salto tucked +(FX) salto forward tucked +(FX) aerial walkover forward +(FX) salto forward stretched with 2 twist +(FX) salto forward stretched with 1 twist +(FX) salto forward stretched with 1.5 twist +(FX) salto forward stretched, feet land together +(FX) double salto backward stretched +(FX) salto backward stretched with 3 twist +(FX) salto backward stretched with 2 twist +(FX) salto backward stretched with 2.5 twist +(FX) salto backward stretched with 1.5 twist +(FX) double salto backward tucked with 2 twist +(FX) double salto backward tucked with 1 
twist +(FX) double salto backward tucked +(FX) double salto backward piked with 1 twist +(FX) double salto backward piked +(BB) sissone (leg separation 180 degree on the diagonal to the floor, take off two feet, land on one foot) +(BB) split jump with 0.5 turn in side position +(BB) split jump +(BB) straddle pike jump or side split jump +(BB) split ring jump (ring jump with front leg horizontal to the floor) +(BB) switch leap with 0.5 turn +(BB) switch leap (leap forward with leg change) +(BB) split leap forward +(BB) johnson (leap forward with leg change and 0.25 turn to side split or straddle pike position) +(BB) switch leap to ring position +(BB) sheep jump (jump with upper back arch and head release with feet to head height/closed Ring) +(BB) wolf hop or jump (hip angle at 45, knees together) +(BB) 1 turn with heel of free leg forward at horizontal throughout turn +(BB) 2 turn on one leg, free leg optional below horizontal +(BB) 1 turn on one leg, free leg optional below horizontal +(BB) 2 turn in tuck stand on one leg, free leg optional +(BB) salto backward tucked with 1 twist +(BB) salto backward tucked +(BB) salto backward stretched-step out (feet land successively) +(BB) salto backward stretched with legs together +(BB) salto sideward tucked, take off from one leg to side stand +(BB) free aerial cartwheel landing in cross position +(BB) salto forward tucked to cross stand +(BB) free aerial walkover forward, landing on one or both feet +(BB) jump backward, flic-flac take-off with 0.5 twist through handstand to walkover forward, also with support on one arm +(BB) flic-flac to land on both feet +(BB) flic-flac with step-out, also with support on one arm +(BB) round-off +(BB) double salto backward tucked +(BB) salto backward tucked +(BB) double salto backward piked +(BB) salto backward stretched with 2 twist +(BB) salto backward stretched with 2.5 twist +(UB) pike sole circle backward with 1 turn to handstand +(UB) pike sole circle backward with 0.5 turn to 
handstand +(UB) pike sole circle backward to handstand +(UB) giant circle backward with 1 turn to handstand +(UB) giant circle backward with 0.5 turn to handstand +(UB) giant circle backward +(UB) giant circle forward with 1 turn on one arm before handstand phase +(UB) giant circle forward with 0.5 turn to handstand +(UB) giant circle forward +(UB) clear hip circle backward to handstand +(UB) clear pike circle backward with 1 turn to handstand +(UB) clear pike circle backward with 0.5 turn to handstand +(UB) clear pike circle backward to handstand +(UB) stalder backward with 1 turn to handstand +(UB) stalder backward to handstand +(UB) counter straddle over high bar to hang +(UB) counter piked over high bar to hang +(UB) (swing backward or front support) salto forward straddled to hang on high bar +(UB) (swing backward) salto forward piked to hang on high bar +(UB) (swing forward or hip circle backward) salto backward with 0.5 turn piked to hang on high bar +(UB) transition flight from high bar to low bar +(UB) transition flight from low bar to high bar +(UB) (swing forward) double salto backward tucked with 1 turn +(UB) (swing backward) double salto forward tucked +(UB) (swing forward) double salto backward stretched diff --git a/tools/data/gym/trim_event.py b/tools/data/gym/trim_event.py new file mode 100644 index 0000000000000000000000000000000000000000..006b0646391c73c42da37cb74b458e5d486f5d0f --- /dev/null +++ b/tools/data/gym/trim_event.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import subprocess + +import mmengine + +data_root = '../../../data/gym' +video_root = f'{data_root}/videos' +anno_root = f'{data_root}/annotations' +anno_file = f'{anno_root}/annotation.json' + +event_anno_file = f'{anno_root}/event_annotation.json' +event_root = f'{data_root}/events' + +videos = os.listdir(video_root) +videos = set(videos) +annotation = mmengine.load(anno_file) +event_annotation = {} + +mmengine.mkdir_or_exist(event_root) + +for k, v in annotation.items(): + if k + '.mp4' not in videos: + print(f'video {k} has not been downloaded') + continue + + video_path = osp.join(video_root, k + '.mp4') + + for event_id, event_anno in v.items(): + timestamps = event_anno['timestamps'][0] + start_time, end_time = timestamps + event_name = k + '_' + event_id + + output_filename = event_name + '.mp4' + + command = [ + 'ffmpeg', '-i', + '"%s"' % video_path, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '8', '-loglevel', 'panic', + '"%s"' % osp.join(event_root, output_filename) + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + print( + f'Trimming of the Event {event_name} of Video {k} Failed', + flush=True) + + segments = event_anno['segments'] + if segments is not None: + event_annotation[event_name] = segments + +mmengine.dump(event_annotation, event_anno_file) diff --git a/tools/data/gym/trim_subaction.py b/tools/data/gym/trim_subaction.py new file mode 100644 index 0000000000000000000000000000000000000000..7cecceaf2b5d12a5f667073a855980e30fddf80c --- /dev/null +++ b/tools/data/gym/trim_subaction.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import subprocess + +import mmengine + +data_root = '../../../data/gym' +anno_root = f'{data_root}/annotations' + +event_anno_file = f'{anno_root}/event_annotation.json' +event_root = f'{data_root}/events' +subaction_root = f'{data_root}/subactions' + +events = os.listdir(event_root) +events = set(events) +annotation = mmengine.load(event_anno_file) + +mmengine.mkdir_or_exist(subaction_root) + +for k, v in annotation.items(): + if k + '.mp4' not in events: + print(f'video {k[:11]} has not been downloaded ' + f'or the event clip {k} not generated') + continue + + video_path = osp.join(event_root, k + '.mp4') + + for subaction_id, subaction_anno in v.items(): + timestamps = subaction_anno['timestamps'] + start_time, end_time = timestamps[0][0], timestamps[-1][1] + subaction_name = k + '_' + subaction_id + + output_filename = subaction_name + '.mp4' + + command = [ + 'ffmpeg', '-i', + '"%s"' % video_path, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '8', '-loglevel', 'panic', + '"%s"' % osp.join(subaction_root, output_filename) + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + print( + f'Trimming of the Subaction {subaction_name} of Event ' + f'{k} Failed', + flush=True) diff --git a/tools/data/hacs/README-CN.md b/tools/data/hacs/README-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..cb1ab76727a36d0805f3ab6598b4e536aac03292 --- /dev/null +++ b/tools/data/hacs/README-CN.md @@ -0,0 +1,119 @@ +# 准备 HACS Segments + +## 简介 + + + +```BibTeX +@inproceedings{zhao2019hacs, + title={Hacs: Human action clips and segments dataset for recognition and temporal localization}, + author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + 
pages={8668--8678}, + year={2019} +} +``` + +### 0. 下载视频 + +在我们开始准备数据集之前,请按照[官方代码库](https://github.com/hangzhaomit/HACS-dataset)的指令下载HACS Segments数据集中的视频。如果有视频缺失,您可以向HACS数据集存储库的维护者提交请求以获取缺失的视频。但是如果一些视频缺失,您仍然可以为MMAction2准备数据集。 + +在下载完数据集后,请将数据集文件夹移动到(或者使用软链接)`$MMACTION2/tools/data/hacs/`。文件夹结构应该如下所示: + +``` +mmaction2 +├── mmaction +├── data +├── configs +├── tools +│ ├── hacs +│ │ ├── slowonly_feature_infer.py +│ │ ├── .. +│ │ ├── data +│ │ │ ├── Applying_sunscreen +│ │ │ │ ├── v_0Ch__DqMPwA.mp4 +│ │ │ │ ├── v_9CTDjFHl8WE.mp4 +│ │ │ │ ├── .. + + +``` + +在开始之前,请确保您位于`$MMACTION2/tools/data/hacs/`路径下。 + +### 1. 提取特征 + +以下是使用[SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py)在Kinetics700数据集上预训练的模型,从HACS视频中提取特征。对于每个视频,我们均匀采样100个视频片段,并提取700维输出(softmax之前)作为特征,即特征形状为100x700。 + +首先,我们使用如下命令生成数据集的视频列表: + +``` +python generate_list.py +``` + +这将生成一个位于`$MMACTION2/tools/data/hacs/`的`hacs_data.txt`文件,其内容格式如下: + +``` +Horseback_riding/v_Sr2BSq_8FMw.mp4 0 +Horseback_riding/v_EQb6OKoqz3Q.mp4 1 +Horseback_riding/v_vYKUV8TRngg.mp4 2 +Horseback_riding/v_Y8U0X1F-0ck.mp4 3 +Horseback_riding/v_hnspbB7wNh0.mp4 4 +Horseback_riding/v_HPhlhrT9IOk.mp4 5 +``` + +接下来,我们使用[slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) 配置文件来提取特征: + +``` +# 指定提取特征的GPU数量 +NUM_GPUS=8 + +# 下载预训练模型权重 +wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth + +bash ../mmaction2/tools/dist_test.sh \ + slowonly_feature_infer.py \ + slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \ + $NUM_GPUS --dump result.pkl +``` + +我们将得到一个名为 `result.pkl` 的文件,其中包含每个视频的大小为100x700的特征。我们将特征重写为csv格式,并保存在 `$MMACTION2/data/HACS/` 目录下。 + +``` +# 确保您位于 $MMACTION2/tools/data/hacs/ +python write_feature_csv.py
+``` + +### 2. 准备标注文件 + +我们首先从官方仓库下载标注文件: + +``` +wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip +unzip HACS_v1.1.1.zip +``` + +解压缩后,应该有一个名为`HACS_v1.1.1`的文件夹,其中包含一个名为`HACS_segments_v1.1.1.json`的文件。 + +我们在`$MMACTION2/data/HACS/`目录下生成`hacs_anno_train.json`、`hacs_anno_val.json`和`hacs_anno_test.json`文件: + +``` +python3 generate_anotations.py +``` + +完成这两个步骤后,HACS Segments数据集的文件夹结构应该如下所示: + +``` +mmaction2 +├── mmaction +├── data +│ ├── HACS +│ │ ├── hacs_anno_train.json +│ │ ├── hacs_anno_val.json +│ │ ├── hacs_anno_test.json +│ │ ├── slowonly_feature +│ │ │ ├── v_008gY2B8Pf4.csv +│ │ │ ├── v_0095rqic1n8.csv +├── configs +├── tools + +``` diff --git a/tools/data/hacs/README.md b/tools/data/hacs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ea82c4e858ff217c23399489658a43787e41687f --- /dev/null +++ b/tools/data/hacs/README.md @@ -0,0 +1,119 @@ +# Preparing HACS Segments + +## Introduction + + + +```BibTeX +@inproceedings{zhao2019hacs, + title={Hacs: Human action clips and segments dataset for recognition and temporal localization}, + author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={8668--8678}, + year={2019} +} +``` + +### Step 0. Download Videos + +Before we start preparing the dataset, please following the official [repository](https://github.com/hangzhaomit/HACS-dataset) to download videos from the HACS Segments dataset. You can submit a request for missing videos to the maintainer of the HACS dataset repository. But you can still prepare the dataset for MMAction2 if some videos are missing. + +After you finish downloading the dataset, please move the dataset folder to `$MMACTION2/tools/data/hacs/` or use a soft link. 
The folder structure should look like: + +``` +mmaction2 +├── mmaction +├── data +├── configs +├── tools +│ ├── hacs +│ │ ├── slowonly_feature_infer.py +│ │ ├── .. +│ │ ├── data +│ │ │ ├── Applying_sunscreen +│ │ │ │ ├── v_0Ch__DqMPwA.mp4 +│ │ │ │ ├── v_9CTDjFHl8WE.mp4 +│ │ │ │ ├── .. + + +``` + +Before we start, make sure you are at `$MMACTION2/tools/data/hacs/`. + +### Step 1. Extract Features + +We extract features from the HACS videos using [SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) pretrained on Kinetics700 dataset. For each video, we uniformly sample 100 video clips and extract the 700-dimensional output (before softmax) as the feature, i.e., the feature shape is 100x700. + +First, we generate a video list of the dataset: + +``` +python generate_list.py +``` + +It will generate an `hacs_data.txt` file located at `$MMACTION2/tools/data/hacs/` which looks like: + +``` +Horseback_riding/v_Sr2BSq_8FMw.mp4 0 +Horseback_riding/v_EQb6OKoqz3Q.mp4 1 +Horseback_riding/v_vYKUV8TRngg.mp4 2 +Horseback_riding/v_Y8U0X1F-0ck.mp4 3 +Horseback_riding/v_hnspbB7wNh0.mp4 4 +Horseback_riding/v_HPhlhrT9IOk.mp4 5 +``` + +Next we use the [slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) config to extract features: + +``` +# number of GPUs to extract feature +NUM_GPUS=8 + +# download the pretraining checkpoint +wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth + +bash ../mmaction2/tools/dist_test.sh \ + slowonly_feature_infer.py \ + slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \ + $NUM_GPUS --dump result.pkl +``` + +We will get a `result.pkl` that contains the 100x700 feature for each video. 
We re-write the features into csv format at `$MMACTION2/data/HACS/`: + +``` +# Make sure you are at $MMACTION2/tools/data/hacs/ +python write_feature_csv.py +``` + +### Step 2. Prepare Annotations + +We first download the original annotations from the official repository: + +``` +wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip +unzip HACS_v1.1.1.zip +``` + +After unzipping, there should be an `HACS_v1.1.1` folder with an `HACS_segments_v1.1.1.json` file in it. + +We generate `hacs_anno_train.json`, `hacs_anno_val.json` and `hacs_anno_test.json` files at `$MMACTION2/data/HACS/`: + +``` +python3 generate_anotations.py +``` + +After the two steps finished, the folder structure of the HACS Segments dataset should look like: + +``` +mmaction2 +├── mmaction +├── data +│ ├── HACS +│ │ ├── hacs_anno_train.json +│ │ ├── hacs_anno_val.json +│ │ ├── hacs_anno_test.json +│ │ ├── slowonly_feature +│ │ │ ├── v_008gY2B8Pf4.csv +│ │ │ ├── v_0095rqic1n8.csv +├── configs +├── tools + +``` diff --git a/tools/data/hacs/generate_anotations.py b/tools/data/hacs/generate_anotations.py new file mode 100644 index 0000000000000000000000000000000000000000..206a6362036675a9ad4d18c58da6330e37df6971 --- /dev/null +++ b/tools/data/hacs/generate_anotations.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import multiprocessing +import os + +import decord + +with open('HACS_v1.1.1/HACS_segments_v1.1.1.json') as f: + all_annotations = json.load(f)['database'] + + +def parse_anno(key): + anno = {} + anno['duration_second'] = float(all_annotations[key]['duration']) + anno['annotations'] = all_annotations[key]['annotations'] + anno['subset'] = all_annotations[key]['subset'] + + labels = set([i['label'] for i in anno['annotations']]) + num_frames = int(anno['duration_second'] * 30) + for label in labels: + path = f'data/{label}/v_{key}.mp4' + if os.path.isfile(path): + vr = decord.VideoReader(path) + num_frames = len(vr) + break + + anno['feature_frame'] = anno['duration_frame'] = num_frames + anno['key'] = f'v_{key}' + return anno + + +pool = multiprocessing.Pool(16) +video_list = list(all_annotations) +outputs = pool.map(parse_anno, video_list) + +train_anno = {} +val_anno = {} +test_anno = {} + +for anno in outputs: + key = anno.pop('key') + subset = anno.pop('subset') + if subset == 'training': + train_anno[key] = anno + elif subset == 'validation': + val_anno[key] = anno + else: + test_anno[key] = anno + +outdir = '../../../data/HACS' +with open(f'{outdir}/hacs_anno_train.json', 'w') as f: + json.dump(train_anno, f) + +with open(f'{outdir}/hacs_anno_val.json', 'w') as f: + json.dump(val_anno, f) + +with open(f'{outdir}/hacs_anno_test.json', 'w') as f: + json.dump(test_anno, f) diff --git a/tools/data/hacs/generate_list.py b/tools/data/hacs/generate_list.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b25d55d11bb9f5115b86c32dc02c11fef46dac --- /dev/null +++ b/tools/data/hacs/generate_list.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os + +data_root = './data' + +video_list = [] +idx = 0 +for folder in os.listdir(data_root): + path = f'{data_root}/{folder}' + for video in os.listdir(path): + line = f'{folder}/{video} {idx}\n' + idx += 1 + video_list.append(line) + +with open('hacs_data.txt', 'w') as f: + for line in video_list: + f.write(line) diff --git a/tools/data/hacs/slowonly_feature_infer.py b/tools/data/hacs/slowonly_feature_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b52b04312077690012996747d19f4a149f7fc7b --- /dev/null +++ b/tools/data/hacs/slowonly_feature_infer.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=20, ignore_last=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=4, save_best='auto'), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffers=dict(type='SyncBuffersHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) + +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')]) +log_level = 'INFO' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + lateral=False, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=700, + spatial_type='avg', + dropout_ratio=0.5, + average_clips=None), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +data_root = './data' +ann_file = 'hacs_data.txt' + 
+test_pipeline = [ + dict(type='DecordInit', io_backend='disk'), + dict( + type='SampleFrames', + clip_len=8, + frame_interval=8, + num_clips=100, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='VideoDataset', + ann_file=ann_file, + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +test_evaluator = dict(type='DumpResults', out_file_path='result.pkl') +test_cfg = dict(type='TestLoop') diff --git a/tools/data/hacs/write_feature_csv.py b/tools/data/hacs/write_feature_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..18b0f34a907a49c3f149d4eb0168d735a937307f --- /dev/null +++ b/tools/data/hacs/write_feature_csv.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmengine + +features = mmengine.load('result.pkl') +video_list = mmengine.list_from_file('hacs_data.txt') +feature_dir = '../../../data/HACS/slowonly_feature' +mmengine.mkdir_or_exist(feature_dir) + +head = ','.join([f'f{i}' for i in range(700)]) + '\n' + +for feature, video in zip(features, video_list): + video_id = video.split()[0].split('/')[1] + csv_file = video_id.replace('mp4', 'csv') + feat = feature['pred_scores']['item'].numpy() + feat = feat.tolist() + csv_path = f'{feature_dir}/{csv_file}' + with open(csv_path, 'w') as f: + f.write(head) + for line in feat: + f.write(str(line)[1:-1] + '\n') diff --git a/tools/data/hmdb51/README.md b/tools/data/hmdb51/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3417f22944d8728c998b10f8e04091136dd8fea6 --- /dev/null +++ b/tools/data/hmdb51/README.md @@ -0,0 +1,125 @@ +# Preparing HMDB51 + +## Introduction + + + +```BibTeX +@article{Kuehne2011HMDBAL, + title={HMDB: A large video database for human motion recognition}, + author={Hilde Kuehne and Hueihan Jhuang and E. Garrote and T. Poggio and Thomas Serre}, + journal={2011 International Conference on Computer Vision}, + year={2011}, + pages={2556-2563} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/hmdb51/`. + +To run the bash scripts below, you need to install `unrar`. you can install it by `sudo apt-get install unrar`, +or refer to [this repo](https://github.com/innerlee/setup) by following the usage and taking [`zzunrar.sh`](https://github.com/innerlee/setup/blob/master/zzunrar.sh) +script for easy installation without sudo. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. 
Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/hmdb51_extracted/ +ln -s /mnt/SSD/hmdb51_extracted/ ../../../data/hmdb51/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## Step 5. Check Directory Structure + +After the whole data process for HMDB51 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for HMDB51. 
+ +In the context of the whole project (for HMDB51 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── hmdb51 +│ │ ├── hmdb51_{train,val}_split_{1,2,3}_rawframes.txt +│ │ ├── hmdb51_{train,val}_split_{1,2,3}_videos.txt +│ │ ├── annotations +│ │ ├── videos +│ │ │ ├── brush_hair +│ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0.avi + +│ │ │ ├── wave +│ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi +│ │ ├── rawframes +│ │ │ ├── brush_hair +│ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0 +│ │ │ │ │ ├── img_00001.jpg +│ │ │ │ │ ├── img_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_x_00001.jpg +│ │ │ │ │ ├── flow_x_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_y_00001.jpg +│ │ │ │ │ ├── flow_y_00002.jpg +│ │ │ ├── ... +│ │ │ ├── wave +│ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0 +│ │ │ │ ├── ... +│ │ │ │ ├── winKen_wave_u_cm_np1_ri_bad_1 + +``` + +For training and evaluating on HMDB51, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/hmdb51/README_zh-CN.md b/tools/data/hmdb51/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..60d01d3c6e6ec9c8b2e23abb40d00da06be5fc9a --- /dev/null +++ b/tools/data/hmdb51/README_zh-CN.md @@ -0,0 +1,121 @@ +# 准备 HMDB51 + +## 简介 + + + +```BibTeX +@article{Kuehne2011HMDBAL, + title={HMDB: A large video database for human motion recognition}, + author={Hilde Kuehne and Hueihan Jhuang and E. Garrote and T. 
Poggio and Thomas Serre}, + journal={2011 International Conference on Computer Vision}, + year={2011}, + pages={2556-2563} +} +``` + +用户可以参照数据集 [官网](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/),获取数据集相关的基本信息。 +在准备数据集前,请确保命令行当前路径为 `$MMACTION2/tools/data/hmdb51/`。 + +为运行下面的 bash 脚本,需要安装 `unrar`。用户可运行 `sudo apt-get install unrar` 安装,或参照 [setup](https://github.com/innerlee/setup),运行 [`zzunrar.sh`](https://github.com/innerlee/setup/blob/master/zzunrar.sh) 脚本实现无管理员权限下的简易安装。 + +## 步骤 1. 下载标注文件 + +首先,用户可使用以下命令下载标注文件。 + +```shell +bash download_annotations.sh +``` + +## 步骤 2. 下载视频 + +之后,用户可使用以下指令下载视频 + +```shell +bash download_videos.sh +``` + +## 步骤 3. 抽取帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。 +用户可使用以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行指令进行抽取(假设 SSD 挂载在 "/mnt/SSD/"上) +mkdir /mnt/SSD/hmdb51_extracted/ +ln -s /mnt/SSD/hmdb51_extracted/ ../../../data/hmdb51/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本,使用 "tvl1" 算法进行抽取。 + +```shell +bash extract_frames.sh +``` + +## 步骤 4. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## 步骤 5. 
检查目录结构 + +在完成 HMDB51 数据集准备流程后,用户可以得到 HMDB51 的 RGB 帧 + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,HMDB51 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── hmdb51 +│ │ ├── hmdb51_{train,val}_split_{1,2,3}_rawframes.txt +│ │ ├── hmdb51_{train,val}_split_{1,2,3}_videos.txt +│ │ ├── annotations +│ │ ├── videos +│ │ │ ├── brush_hair +│ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0.avi + +│ │ │ ├── wave +│ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi +│ │ ├── rawframes +│ │ │ ├── brush_hair +│ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0 +│ │ │ │ │ ├── img_00001.jpg +│ │ │ │ │ ├── img_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_x_00001.jpg +│ │ │ │ │ ├── flow_x_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_y_00001.jpg +│ │ │ │ │ ├── flow_y_00002.jpg +│ │ │ ├── ... +│ │ │ ├── wave +│ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0 +│ │ │ │ ├── ... +│ │ │ │ ├── winKen_wave_u_cm_np1_ri_bad_1 + +``` + +关于对 HMDB51 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/hmdb51/download_annotations.sh b/tools/data/hmdb51/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..bab3a4b9394eaa479dbbd43586c7068cc581ca60 --- /dev/null +++ b/tools/data/hmdb51/download_annotations.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hmdb51/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} +wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rar --no-check-certificate + +# sudo apt-get install unrar +unrar x test_train_splits.rar +rm test_train_splits.rar + +mv testTrainMulti_7030_splits/*.txt ./ +rmdir testTrainMulti_7030_splits + +cd - diff --git a/tools/data/hmdb51/download_videos.sh b/tools/data/hmdb51/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..76dbede5f8ec1d648e0a2142b151f55d336e648e --- /dev/null +++ b/tools/data/hmdb51/download_videos.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hmdb51/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +mkdir -p ./videos +cd ./videos + +wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar --no-check-certificate + +# sudo apt-get install unrar +unrar x ./hmdb51_org.rar +rm ./hmdb51_org.rar + +# extract all rar files with full path +for file in *.rar; do unrar x $file; done + +rm ./*.rar +cd "../../../tools/data/hmdb51" diff --git a/tools/data/hmdb51/extract_frames.sh b/tools/data/hmdb51/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..95df7125543b5d3a5ca1ce955b5f1ae739dfe0bd --- /dev/null +++ b/tools/data/hmdb51/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ ../../data/hmdb51/rawframes/ --task both --level 2 --flow-type tvl1 +echo "Raw frames (RGB and Flow) Generated" +cd hmdb51/ diff --git a/tools/data/hmdb51/extract_rgb_frames.sh b/tools/data/hmdb51/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c8865e40451a2b20f52d7fabd09c9cc42581ea2 --- /dev/null +++ b/tools/data/hmdb51/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ 
+../../data/hmdb51/rawframes/ --task rgb --level 2 --ext avi +echo "Generate raw frames (RGB only)" + +cd hmdb51/ diff --git a/tools/data/hmdb51/extract_rgb_frames_opencv.sh b/tools/data/hmdb51/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..9e257635608a1871e6e4af9d8d7d615540d22da5 --- /dev/null +++ b/tools/data/hmdb51/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hmdb51/videos/ ../../data/hmdb51/rawframes/ --task rgb --level 2 --ext avi --use-opencv +echo "Generate raw frames (RGB only)" + +cd hmdb51/ diff --git a/tools/data/hmdb51/generate_rawframes_filelist.sh b/tools/data/hmdb51/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..4fc16d55d96522162f145d442bfda1c98d676264 --- /dev/null +++ b/tools/data/hmdb51/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py hmdb51 data/hmdb51/rawframes/ --level 2 --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/hmdb51/ diff --git a/tools/data/hmdb51/generate_videos_filelist.sh b/tools/data/hmdb51/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d6fe7dcfb91a9baa62695892f1bd183b39cbd7b --- /dev/null +++ b/tools/data/hmdb51/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py hmdb51 data/hmdb51/videos/ --level 2 --format videos --shuffle +echo "Filelist for videos generated."
+ +cd tools/data/hmdb51/ diff --git a/tools/data/hmdb51/label_map.txt b/tools/data/hmdb51/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..852af0f99d13744a041bd2fe50adc6437c86c4ed --- /dev/null +++ b/tools/data/hmdb51/label_map.txt @@ -0,0 +1,51 @@ +brush_hair +cartwheel +catch +chew +clap +climb +climb_stairs +dive +draw_sword +dribble +drink +eat +fall_floor +fencing +flic_flac +golf +handstand +hit +hug +jump +kick +kick_ball +kiss +laugh +pick +pour +pullup +punch +push +pushup +ride_bike +ride_horse +run +shake_hands +shoot_ball +shoot_bow +shoot_gun +sit +situp +smile +smoke +somersault +stand +swing_baseball +sword +sword_exercise +talk +throw +turn +walk +wave diff --git a/tools/data/hvu/README.md b/tools/data/hvu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d31a8725340f39aade2589da6b26e3af9503773d --- /dev/null +++ b/tools/data/hvu/README.md @@ -0,0 +1,123 @@ +# Preparing HVU + +## Introduction + + + +```BibTeX +@article{Diba2019LargeSH, + title={Large Scale Holistic Video Understanding}, + author={Ali Diba and M. Fayyaz and Vivek Sharma and Manohar Paluri and Jurgen Gall and R. Stiefelhagen and L. Gool}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +For basic dataset information, please refer to the official [project](https://github.com/holistic-video-understanding/HVU-Dataset/) and the [paper](https://arxiv.org/abs/1904.11451). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/hvu/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +Besides, you need to run the following command to parse the tag list of HVU. + +```shell +python parse_tag_list.py +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. 
+The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. + +```shell +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +You can use the following script to extract both RGB and Flow frames. + +```shell +bash extract_frames.sh +``` + +By default, we generate frames with short edge resized to 256. +More details can be found in [prepare_dataset](/docs/en/user_guides/prepare_dataset.md) + +## Step 4. Generate File List + +You can run the follow scripts to generate file list in the format of videos and rawframes, respectively. + +```shell +bash generate_videos_filelist.sh +# execute the command below when rawframes are ready +bash generate_rawframes_filelist.sh +``` + +## Step 5. Generate File List for Each Individual Tag Categories + +This part is **optional** if you don't want to train models on HVU for a specific tag category. + +The file list generated in step 4 contains labels of different categories. These file lists can only be +handled with HVUDataset and used for multi-task learning of different tag categories. The component +`LoadHVULabel` is needed to load the multi-category tags, and the `HVULoss` should be used to train +the model. + +If you only want to train video recognition models for a specific tag category, i.e. you want to train +a recognition model on HVU which only handles tags in the category `action`, we recommend you to use +the following command to generate file lists for the specific tag category. The new list, which only +contains tags of a specific category, can be handled with `VideoDataset` or `RawframeDataset`. The +recognition models can be trained with `BCELossWithLogits`. 
+ +The following command generates file list for the tag category ${category}, note that the tag category you +specified should be in the 6 tag categories available in HVU: \['action', 'attribute', 'concept', 'event', +'object', 'scene'\]. + +```shell +python generate_sub_file_list.py path/to/filelist.json ${category} +``` + +The filename of the generated file list for ${category} is generated by replacing `hvu` in the original +filename with `hvu_${category}`. For example, if the original filename is `hvu_train.json`, the filename +of the file list for action is `hvu_action_train.json`. + +## Step 6. Folder Structure + +After the whole data pipeline for HVU preparation. +you can get the rawframes (RGB + Flow), videos and annotation files for HVU. + +In the context of the whole project (for HVU only), the full folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── hvu +│ │ ├── hvu_train_video.json +│ │ ├── hvu_val_video.json +│ │ ├── hvu_train.json +│ │ ├── hvu_val.json +│ │ ├── annotations +│ │ ├── videos_train +│ │ │ ├── OLpWTpTC4P8_000570_000670.mp4 +│ │ │ ├── xsPKW4tZZBc_002330_002430.mp4 +│ │ │ ├── ... +│ │ ├── videos_val +│ │ ├── rawframes_train +│ │ ├── rawframes_val + +``` + +For training and evaluating on HVU, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/hvu/README_zh-CN.md b/tools/data/hvu/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..5bb1185c9c87db52a448dfea590bad20dc6534a3 --- /dev/null +++ b/tools/data/hvu/README_zh-CN.md @@ -0,0 +1,110 @@ +# 准备 HVU + +## 简介 + + + +```BibTeX +@article{Diba2019LargeSH, + title={Large Scale Holistic Video Understanding}, + author={Ali Diba and M. Fayyaz and Vivek Sharma and Manohar Paluri and Jurgen Gall and R. Stiefelhagen and L. 
Gool}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +请参照 [官方项目](https://github.com/holistic-video-understanding/HVU-Dataset/) 及 [原论文](https://arxiv.org/abs/1904.11451) 以获取数据集基本信息。 +在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/hvu/`。 + +## 1. 准备标注文件 + +首先,用户可以使用如下脚本下载标注文件并进行预处理: + +```shell +bash download_annotations.sh +``` + +此外,用户可使用如下命令解析 HVU 的标签列表: + +```shell +python parse_tag_list.py +``` + +## 2. 准备视频 + +用户可以使用以下脚本准备视频,视频准备代码修改自 [ActivityNet 爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。 +注意这一步骤将花费较长时间。 + +```shell +bash download_videos.sh +``` + +## 3. 提取 RGB 帧和光流 + +如果用户仅使用 video loader,则可以跳过本步。 + +在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +用户可使用如下脚本同时抽取 RGB 帧和光流: + +```shell +bash extract_frames.sh +``` + +该脚本默认生成短边长度为 256 的帧,可参考 [数据准备](/docs/zh_cn/user_guides/prepare_dataset.md) 获得更多细节。 + +## 4. 生成文件列表 + +用户可以使用以下两个脚本分别为视频和帧文件夹生成文件列表: + +```shell +bash generate_videos_filelist.sh +# 为帧文件夹生成文件列表 +bash generate_rawframes_filelist.sh +``` + +## 5. 为每个 tag 种类生成文件列表 + +若用户需要为 HVU 数据集的每个 tag 种类训练识别模型,则需要进行此步骤。 + +步骤 4 中生成的文件列表包含不同类型的标签,仅支持使用 HVUDataset 进行涉及多个标签种类的多任务学习。加载数据的过程中需要使用 `LoadHVULabel` 类进行多类别标签的加载,训练过程中使用 `HVULoss` 作为损失函数。 + +如果用户仅需训练某一特定类别的标签,例如训练一识别模型用于识别 HVU 中 `action` 类别的标签,则建议使用如下脚本为特定标签种类生成文件列表。新生成的列表将只含有特定类别的标签,因此可使用 `VideoDataset` 或 `RawframeDataset` 进行加载。训练过程中使用 `BCELossWithLogits` 作为损失函数。 + +以下脚本为类别为 ${category} 的标签生成文件列表,注意仅支持 HVU 数据集包含的 6 种标签类别: action, attribute, concept, event, object, scene。 + +```shell +python generate_sub_file_list.py path/to/filelist.json ${category} +``` + +对于类别 ${category},生成的标签列表文件名中将使用 `hvu_${category}` 替代 `hvu`。例如,若原指定文件名为 `hvu_train.json`,则对于类别 action,生成的文件列表名为 `hvu_action_train.json`。 + +## 6.
目录结构 + +在完整完成 HVU 的数据处理后,将得到帧文件夹(RGB 帧和光流帧),视频以及标注文件。 + +在整个项目目录下(仅针对 HVU),完整目录结构如下所示: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── hvu +│ │ ├── hvu_train_video.json +│ │ ├── hvu_val_video.json +│ │ ├── hvu_train.json +│ │ ├── hvu_val.json +│ │ ├── annotations +│ │ ├── videos_train +│ │ │ ├── OLpWTpTC4P8_000570_000670.mp4 +│ │ │ ├── xsPKW4tZZBc_002330_002430.mp4 +│ │ │ ├── ... +│ │ ├── videos_val +│ │ ├── rawframes_train +│ │ ├── rawframes_val + +``` + +关于 HVU 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/hvu/download.py b/tools/data/hvu/download.py new file mode 100644 index 0000000000000000000000000000000000000000..c86b4da6cac0311bf0ea560151ad5ecbb18b3d48 --- /dev/null +++ b/tools/data/hvu/download.py @@ -0,0 +1,203 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/activitynet/ActivityNet/ +# Original licence: Copyright (c) Microsoft, under the MIT License. 
+# ------------------------------------------------------------------------------ + +import argparse +import glob +import os +import shutil +import ssl +import subprocess +import uuid + +import mmengine +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context +args = None + + +def create_video_folders(output_dir, tmp_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + +def construct_video_filename(item, trim_format, output_dir): + """Given a dataset row, this function constructs the output filename for a + given video.""" + youtube_id, start_time, end_time = item + start_time, end_time = int(start_time * 10), int(end_time * 10) + basename = '%s_%s_%s.mp4' % (youtube_id, trim_format % start_time, + trim_format % end_time) + output_filename = os.path.join(output_dir, basename) + return output_filename + + +def download_clip(video_identifier, + output_filename, + start_time, + end_time, + tmp_dir='/tmp/hvu/.tmp_dir', + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + start_time: float + Indicates the beginning time in seconds from where the video + will be trimmed. + end_time: float + Indicates the ending time in seconds of the trimmed video. + """ + # Defensive argument checking. 
+ assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4()) + + if not os.path.exists(output_filename): + if not os.path.exists(tmp_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', + '--no-check-certificate', '-f', 'mp4', '-o', + '"%s"' % tmp_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Downloading Failed' + else: + break + + tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] + # Construct command to trim the videos (ffmpeg required). + command = [ + 'ffmpeg', '-i', + '"%s"' % tmp_filename, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '1', '-loglevel', 'panic', + '"%s"' % output_filename + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + return status, 'Trimming Failed' + + # Check if the video was successfully saved. 
+ status = os.path.exists(output_filename) + os.remove(tmp_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(item, trim_format, tmp_dir, output_dir): + """Wrapper for parallel processing purposes.""" + output_filename = construct_video_filename(item, trim_format, output_dir) + clip_id = os.path.basename(output_filename).split('.mp4')[0] + if os.path.exists(output_filename): + status = tuple([clip_id, True, 'Exists']) + return status + + youtube_id, start_time, end_time = item + downloaded, log = download_clip( + youtube_id, output_filename, start_time, end_time, tmp_dir=tmp_dir) + + status = tuple([clip_id, downloaded, log]) + return status + + +def parse_hvu_annotations(input_csv): + """Returns a parsed DataFrame. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'Tags, youtube_id, time_start, time_end' + returns: + ------- + dataset: List of tuples. Each tuple consists of + (youtube_id, time_start, time_end). The type of time is float. + """ + lines = open(input_csv).readlines() + lines = [x.strip().split(',')[1:] for x in lines[1:]] + + lines = [(x[0], float(x[1]), float(x[2])) for x in lines] + + return lines + + +def main(input_csv, + output_dir, + trim_format='%06d', + num_jobs=24, + tmp_dir='/tmp/hvu'): + + tmp_dir = os.path.join(tmp_dir, '.tmp_dir') + + # Reading and parsing HVU. + dataset = parse_hvu_annotations(input_csv) + + # Creates folders where videos will be saved later. + create_video_folders(output_dir, tmp_dir) + + # Download all clips. + if num_jobs == 1: + status_lst = [] + for item in dataset: + status_lst.append( + download_clip_wrapper(item, trim_format, tmp_dir, output_dir)) + else: + status_lst = Parallel(n_jobs=num_jobs)( + delayed(download_clip_wrapper)(item, trim_format, tmp_dir, + output_dir) for item in dataset) + + # Clean tmp dir. + shutil.rmtree(tmp_dir) + # Save download report. 
+ mmengine.dump(status_lst, 'download_report.json') + + +if __name__ == '__main__': + description = 'Helper script for downloading and trimming HVU videos.' + p = argparse.ArgumentParser(description=description) + p.add_argument( + 'input_csv', + type=str, + help=('CSV file containing the following format: ' + 'Tags, youtube_id, time_start, time_end')) + p.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved.') + p.add_argument( + '-f', + '--trim-format', + type=str, + default='%06d', + help=('This will be the format for the ' + 'filename of trimmed videos: ' + 'videoid_%0xd(start_time)_%0xd(end_time).mp4. ' + 'Note that the start_time is multiplied by 10 since ' + 'decimal exists somewhere. ')) + p.add_argument('-n', '--num-jobs', type=int, default=24) + p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/hvu') + main(**vars(p.parse_args())) diff --git a/tools/data/hvu/download_annotations.sh b/tools/data/hvu/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..a247cc12c6d7faa634c609fb7a566353883bda19 --- /dev/null +++ b/tools/data/hvu/download_annotations.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +DATA_DIR="../../../data/hvu/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +git clone https://github.com/holistic-video-understanding/HVU-Dataset.git + +cd HVU-Dataset +unzip -o HVU_Train_V1.0.zip +unzip -o HVU_Val_V1.0.zip +cd .. 
+mv HVU-Dataset/HVU_Train_V1.0.csv ${DATA_DIR}/hvu_train.csv +mv HVU-Dataset/HVU_Val_V1.0.csv ${DATA_DIR}/hvu_val.csv +mv HVU-Dataset/HVU_Tags_Categories_V1.0.csv ${DATA_DIR}/hvu_categories.csv + +rm -rf HVU-Dataset diff --git a/tools/data/hvu/download_videos.sh b/tools/data/hvu/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..5d2b7167f27398de908f5e7fe10b5fe2e4be88ea --- /dev/null +++ b/tools/data/hvu/download_videos.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate hvu +pip install mmcv +pip install --upgrade youtube-dl + +DATA_DIR="../../../data/hvu" +ANNO_DIR="../../../data/hvu/annotations" +python download.py ${ANNO_DIR}/hvu_train.csv ${DATA_DIR}/videos_train +python download.py ${ANNO_DIR}/hvu_val.csv ${DATA_DIR}/videos_val + +source deactivate hvu +conda remove -n hvu --all diff --git a/tools/data/hvu/environment.yml b/tools/data/hvu/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d9959e88a91881de1be2d38928c63e9aa79938 --- /dev/null +++ b/tools/data/hvu/environment.yml @@ -0,0 +1,36 @@ +name: kinetics +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/hvu/extract_frames.sh b/tools/data/hvu/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..c81814ccd39219b3836cc0fc2bfe4a6ce929a57d --- /dev/null +++ 
b/tools/data/hvu/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/hvu/videos_train/ ../../data/hvu/rawframes_train/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/hvu/videos_val/ ../../data/hvu/rawframes_val/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd hvu/ diff --git a/tools/data/hvu/generate_file_list.py b/tools/data/hvu/generate_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..3cbfba8ace63dd717747b54124cb624c8c4e921e --- /dev/null +++ b/tools/data/hvu/generate_file_list.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import fnmatch +import glob +import os +import os.path as osp + +import mmengine + +annotation_root = '../../data/hvu/annotations' +tag_file = 'hvu_tags.json' +args = None + + +def parse_directory(path, + rgb_prefix='img_', + flow_x_prefix='flow_x_', + flow_y_prefix='flow_y_', + level=1): + """Parse directories holding extracted frames from standard benchmarks. + + Args: + path (str): Directory path to parse frames. + rgb_prefix (str): Prefix of generated rgb frames name. + default: 'img_'. + flow_x_prefix (str): Prefix of generated flow x name. + default: `flow_x_`. + flow_y_prefix (str): Prefix of generated flow y name. + default: `flow_y_`. + level (int): Directory level for glob searching. Options are 1 and 2. + default: 1. + + Returns: + dict: frame info dict with video id as key and tuple(path(str), + rgb_num(int), flow_x_num(int)) as value. 
+    """
+    print(f'parse frames under directory {path}')
+    if level == 1:
+        # Only search for one-level directory
+        def locate_directory(x):
+            return osp.basename(x)
+
+        frame_dirs = glob.glob(osp.join(path, '*'))
+
+    elif level == 2:
+        # search for two-level directory
+        def locate_directory(x):
+            return osp.join(osp.basename(osp.dirname(x)), osp.basename(x))
+
+        frame_dirs = glob.glob(osp.join(path, '*', '*'))
+
+    else:
+        raise ValueError('level can be only 1 or 2')
+
+    def count_files(directory, prefix_list):
+        """Count file number with a given directory and prefix.
+
+        Args:
+            directory (str): Data directory to be searched.
+            prefix_list (list): List of filename prefixes.
+
+        Returns:
+            list (int): Number list of the file with the prefix.
+        """
+        lst = os.listdir(directory)
+        cnt_list = [len(fnmatch.filter(lst, x + '*')) for x in prefix_list]
+        return cnt_list
+
+    # check RGB
+    frame_dict = {}
+    for i, frame_dir in enumerate(frame_dirs):
+        total_num = count_files(frame_dir,
+                                (rgb_prefix, flow_x_prefix, flow_y_prefix))
+        dir_name = locate_directory(frame_dir)
+
+        num_x = total_num[1]
+        num_y = total_num[2]
+        if num_x != num_y:
+            raise ValueError(f'x and y direction have different number '
+                             f'of flow images in video directory: {frame_dir}')
+        if i % 200 == 0:
+            print(f'{i} videos parsed')
+
+        frame_dict[dir_name] = (frame_dir, total_num[0], num_x)
+
+    print('frame directory analysis done')
+    return frame_dict
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='build file list for HVU')
+    parser.add_argument('--input_csv', type=str, help='path of input csv file')
+    parser.add_argument(
+        '--src_dir', type=str, help='source video / frames directory')
+    parser.add_argument(
+        '--output',
+        type=str,
+        help='output filename, should '
+        'end with .json')
+    parser.add_argument(
+        '--mode',
+        type=str,
+        choices=['frames', 'videos'],
+        help='generate file list for frames or videos')
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    tag_cates = mmengine.load(tag_file)  # NOTE(review): `tag_file` is not defined in this hunk -- confirm it is declared earlier in the file
+    tag2category = {}
+    for k in tag_cates:
+        for tag in tag_cates[k]:
+            tag2category[tag] = k
+
+    data_list = open(args.input_csv).readlines()
+    data_list = [x.strip().split(',') for x in data_list[1:]]
+
+    if args.mode == 'videos':
+        downloaded = os.listdir(args.src_dir)
+        downloaded = [x.split('.')[0] for x in downloaded]
+        downloaded_set = set(downloaded)
+    else:
+        parse_result = parse_directory(args.src_dir)
+        downloaded_set = set(parse_result)
+
+    def parse_line(line):
+        tags, youtube_id, start, end = line
+        start, end = int(float(start) * 10), int(float(end) * 10)
+        newname = f'{youtube_id}_{start:06d}_{end:06d}'
+        tags = tags.split('|')
+        all_tags = {}
+        for tag in tags:
+            category = tag2category[tag]
+            all_tags.setdefault(category,
+                                []).append(tag_cates[category].index(tag))
+        return newname, all_tags
+
+    data_list = [parse_line(line) for line in data_list]
+    data_list = [line for line in data_list if line[0] in downloaded_set]
+
+    if args.mode == 'frames':
+        result = [
+            dict(
+                frame_dir=k[0], total_frames=parse_result[k[0]][1], label=k[1])
+            for k in data_list
+        ]
+    elif args.mode == 'videos':
+        result = [dict(filename=k[0] + '.mp4', label=k[1]) for k in data_list]
+    mmengine.dump(result, args.output)
diff --git a/tools/data/hvu/generate_rawframes_filelist.sh b/tools/data/hvu/generate_rawframes_filelist.sh
new file mode 100644
index 0000000000000000000000000000000000000000..68c33b258817cf2db3104df6be34d3003d04ce7d
--- /dev/null
+++ b/tools/data/hvu/generate_rawframes_filelist.sh
@@ -0,0 +1,5 @@
+# to generate file list of frames
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_train.csv --src_dir ../../../data/hvu/rawframes_train \
+    --output ../../../data/hvu/hvu_train.json --mode frames
+python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_val.csv --src_dir ../../../data/hvu/rawframes_val \
+    --output ../../../data/hvu/hvu_val.json --mode frames
diff
--git a/tools/data/hvu/generate_sub_file_list.py b/tools/data/hvu/generate_sub_file_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..f304c7f264b4e981c5af2335b9a690543d3e3350
--- /dev/null
+++ b/tools/data/hvu/generate_sub_file_list.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import mmengine
+
+
+def main(annotation_file, category):
+    assert category in [
+        'action', 'attribute', 'concept', 'event', 'object', 'scene'
+    ]
+
+    data = mmengine.load(annotation_file)
+    basename = osp.basename(annotation_file)
+    dirname = osp.dirname(annotation_file)
+    basename = basename.replace('hvu', f'hvu_{category}')
+
+    target_file = osp.join(dirname, basename)
+
+    result = []
+    for item in data:
+        label = item['label']
+        if category in label:
+            item['label'] = label[category]
+            result.append(item)
+
+    mmengine.dump(result, target_file)  # write the filtered per-category subset, not the full list
+
+
+if __name__ == '__main__':
+    description = 'Helper script for generating HVU per-category file list.'
+ p = argparse.ArgumentParser(description=description) + p.add_argument( + 'annotation_file', + type=str, + help=('The annotation file which contains tags of all categories.')) + p.add_argument( + 'category', + type=str, + choices=['action', 'attribute', 'concept', 'event', 'object', 'scene'], + help='The tag category that you want to generate file list for.') + main(**vars(p.parse_args())) diff --git a/tools/data/hvu/generate_videos_filelist.sh b/tools/data/hvu/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..8bcbd03a7822d9e5fa181d031b610ac2eaa400c0 --- /dev/null +++ b/tools/data/hvu/generate_videos_filelist.sh @@ -0,0 +1,5 @@ +# to generate file lists of videos +python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_train.csv --src_dir ../../../data/hvu/videos_train \ + --output ../../../data/hvu/hvu_train_video.json --mode videos +python generate_file_list.py --input_csv ../../../data/hvu/annotations/hvu_val.csv --src_dir ../../../data/hvu/videos_val \ + --output ../../../data/hvu/hvu_val_video.json --mode videos diff --git a/tools/data/hvu/label_map.json b/tools/data/hvu/label_map.json new file mode 100644 index 0000000000000000000000000000000000000000..a6525d473f307e5ad58b48d51515cfff6a120275 --- /dev/null +++ b/tools/data/hvu/label_map.json @@ -0,0 +1 @@ +{"action": ["abseiling", "acrobatics", "acting_in_play", "adjusting_glasses", "air_drumming", "alligator_wrestling", "alpine_skiing", "american_football", "angling", "answering_questions", "applauding", "applying_cream", "archaeological_excavation", "archery", "arguing", "arm_wrestling", "arranging_flowers", "assembling_bicycle", "assembling_computer", "attending_conference", "auctioning", "auto_racing", "backflip_human_", "baking_cookies", "ball_game", "bandaging", "barbequing", "bartending", "base_jumping", "baseball", "basketball_moves", "bathing", "bathing_dog", "baton_twirling", "battle_rope_training", "beach_soccer", "beatboxing", 
"bee_keeping", "belly_dancing", "bench_pressing", "bending_back", "bending_metal", "biking_through_snow", "blasting_sand", "blowdrying_hair", "blowing_bubble_gum", "blowing_glass", "blowing_leaves", "blowing_nose", "blowing_out_candles", "bmx", "boating", "bobsledding", "bodybuilding", "bodysurfing", "bookbinding", "bottling", "bouldering", "bouncing_on_bouncy_castle", "bouncing_on_trampoline", "bowling", "boxing", "braiding_hair", "breading_or_breadcrumbing", "breakdancing", "breaking_boards", "breathing_fire", "brush_painting", "brushing_hair", "brushing_teeth", "building_cabinet", "building_lego", "building_sandcastle", "building_shed", "bull_fighting", "bulldozing", "bungee_jumping", "burping", "busking", "calculating", "calf_roping", "calligraphy", "canoeing_or_kayaking", "capoeira", "capsizing", "card_game", "card_stacking", "card_throwing", "carrying_baby", "cartwheeling", "carving_ice", "carving_pumpkin", "casting_fishing_line", "catching_fish", "catching_or_throwing_baseball", "catching_or_throwing_frisbee", "catching_or_throwing_softball", "caving", "celebrating", "changing_gear_in_car", "changing_oil", "changing_wheel_not_on_bike_", "checking_tires", "cheering", "cheerleading", "chewing_gum", "chiseling_stone", "chiseling_wood", "chopping_meat", "chopping_vegetables", "chopping_wood", "choreography", "clam_digging", "clapping", "clay_pottery_making", "clean_and_jerk", "cleaning_gutters", "cleaning_pool", "cleaning_shoes", "cleaning_toilet", "cleaning_windows", "climbing", "climbing_a_rope", "climbing_ladder", "climbing_tree", "clipping_cat_claws", "coloring_in", "combing_hair", "contact_juggling", "contorting", "control", "cooking", "cooking_egg", "cooking_on_campfire", "cooking_sausages_not_on_barbeque_", "cooking_scallops", "cosplaying", "counting_money", "country_line_dancing", "cracking_back", "cracking_knuckles", "cracking_neck", "craft", "crawling_baby", "crochet", "croquet", "cross", "cross_country_cycling", "crossing_eyes", "crossing_river", 
"crying", "cumbia", "curling_hair", "curling_sport_", "cutting_apple", "cutting_nails", "cutting_orange", "cutting_pineapple", "cutting_the_grass", "cutting_watermelon", "cycling", "dance", "dancing_ballet", "dancing_charleston", "dancing_gangnam_style", "dancing_macarena", "deadlifting", "decorating_the_christmas_tree", "delivering_mail", "dining", "directing_traffic", "disc_dog", "disc_golfing", "diving", "diving_cliff", "docking_boat", "dodgeball", "doing_a_powerbomb", "doing_aerobics", "doing_jigsaw_puzzle", "doing_karate", "doing_kickboxing", "doing_laundry", "doing_motocross", "doing_nails", "downhill_mountain_biking", "drawing", "dribbling_basketball", "drinking", "drinking_shots", "driving_car", "driving_tractor", "drooling", "drop_kicking", "drum_corps", "drumming_fingers", "dumpster_diving", "dunking_basketball", "dyeing_eyebrows", "dyeing_hair", "eating", "eating_burger", "eating_cake", "eating_carrots", "eating_chips", "eating_doughnuts", "eating_hotdog", "eating_ice_cream", "eating_spaghetti", "eating_watermelon", "egg_hunting", "embroidering", "equitation", "exercising_with_an_exercise_ball", "extinguishing_fire", "faceplanting", "falling_off_bike", "falling_off_chair", "feeding_birds", "feeding_fish", "feeding_goats", "fencing_sport_", "fidgeting", "fight", "figure_skating", "finger_snapping", "fishing", "fixing_bicycle", "fixing_hair", "fixing_the_roof", "flint_knapping", "flipping_pancake", "fly_casting", "fly_fishing", "fly_tying", "flying_kite", "folding_clothes", "folding_napkins", "folding_paper", "folk_dance", "front_raises", "frying", "frying_vegetables", "futsal", "gambling", "geocaching", "getting_a_haircut", "getting_a_piercing", "getting_a_tattoo", "giving_or_receiving_award", "gliding", "gold_panning", "golf", "golf_chipping", "golf_driving", "golf_putting", "gospel_singing_in_church", "grappling", "grilling", "grinding_meat", "grooming_dog", "grooming_horse", "gymnastics", "gymnastics_tumbling", "hammer_throw", "hand_car_wash", 
"hand_washing_clothes", "harvest", "head_stand", "headbanging", "headbutting", "high_jump", "high_kick", "historical_reenactment", "hitting_a_pinata", "hitting_baseball", "hockey_stop", "holding_snake", "home_roasting_coffee", "hopscotch", "hoverboarding", "huddling", "hugging_baby", "hugging_not_baby_", "hula_hooping", "hunt_seat", "hurdling", "hurling_sport_", "ice_climbing", "ice_fishing", "ice_skating", "ice_swimming", "inflating_balloons", "inline_skating", "installing_carpet", "ironing", "ironing_hair", "javelin_throw", "jaywalking", "jetskiing", "jogging", "juggling_balls", "juggling_fire", "juggling_soccer_ball", "jumping", "jumping_bicycle", "jumping_into_pool", "jumping_jacks", "jumpstyle_dancing", "karaoke", "kicking_field_goal", "kicking_soccer_ball", "kissing", "kitesurfing", "knitting", "krumping", "land_sailing", "laughing", "lawn_mower_racing", "laying_bricks", "laying_concrete", "laying_stone", "laying_tiles", "layup_drill_in_basketball", "learning", "leatherworking", "licking", "lifting_hat", "lighting_fire", "lock_picking", "logging", "long_jump", "longboarding", "looking_at_phone", "luge", "lunge", "making_a_cake", "making_a_lemonade", "making_a_sandwich", "making_an_omelette", "making_balloon_shapes", "making_bubbles", "making_cheese", "making_horseshoes", "making_jewelry", "making_paper_aeroplanes", "making_pizza", "making_snowman", "making_sushi", "making_tea", "making_the_bed", "marching", "marching_percussion", "marriage_proposal", "massaging_back", "massaging_feet", "massaging_legs", "massaging_neck", "massaging_person_s_head", "milking_cow", "modern_dance", "moon_walking", "mopping_floor", "mosh_pit_dancing", "motorcycling", "mountain_biking", "mountain_climber_exercise_", "moving_furniture", "mowing_lawn", "mushroom_foraging", "needle_felting", "needlework", "news_anchoring", "opening_bottle_not_wine_", "opening_door", "opening_present", "opening_refrigerator", "opening_wine_bottle", "origami", "outdoor_recreation", "packing", 
"painting_fence", "painting_furniture", "pan_frying", "parachuting", "paragliding", "parasailing", "parkour", "passing_american_football_in_game_", "passing_american_football_not_in_game_", "passing_soccer_ball", "peeling_apples", "peeling_potatoes", "percussion", "person_collecting_garbage", "petting_animal_not_cat_", "petting_cat", "photobombing", "photocopying", "photograph", "physical_exercise", "picking_fruit", "pillow_fight", "pinching", "pirouetting", "pitch", "planing_wood", "planting_trees", "plastering", "plataform_diving", "playing_accordion", "playing_badminton", "playing_bagpipes", "playing_basketball", "playing_bass_guitar", "playing_beer_pong", "playing_blackjack", "playing_cello", "playing_chess", "playing_clarinet", "playing_congas", "playing_controller", "playing_cricket", "playing_cymbals", "playing_darts", "playing_didgeridoo", "playing_dominoes", "playing_drums", "playing_field_hockey", "playing_flute", "playing_gong", "playing_guitar", "playing_hand_clapping_games", "playing_harmonica", "playing_harp", "playing_ice_hockey", "playing_keyboard", "playing_kickball", "playing_lacrosse", "playing_laser_tag", "playing_lute", "playing_maracas", "playing_marbles", "playing_monopoly", "playing_netball", "playing_ocarina", "playing_organ", "playing_paintball", "playing_pan_pipes", "playing_piano", "playing_pinball", "playing_ping_pong", "playing_poker", "playing_polo", "playing_recorder", "playing_rubiks_cube", "playing_saxophone", "playing_scrabble", "playing_squash_or_racquetball", "playing_ten_pins", "playing_tennis", "playing_trombone", "playing_trumpet", "playing_ukulele", "playing_violin", "playing_volleyball", "playing_water_polo", "playing_with_trains", "playing_xylophone", "poking_bellybutton", "pole_vault", "polishing_forniture", "polishing_metal", "popping_balloons", "pouring_beer", "powerbocking", "preparing_pasta", "preparing_salad", "presenting_weather_forecast", "print", "public_speaking", "pull_ups", "pumping_fist", "pumping_gas", 
"punch", "punching_bag", "punching_person_boxing_", "purl", "push_up", "pushing_car", "pushing_cart", "pushing_wheelbarrow", "pushing_wheelchair", "putting_in_contact_lenses", "putting_on_eyeliner", "putting_on_foundation", "putting_on_lipstick", "putting_on_mascara", "putting_on_sari", "putting_on_shoes", "rafting", "raising_eyebrows", "raking_leaves", "reading", "reading_book", "reading_newspaper", "recording_music", "recreation", "recreational_fishing", "removing_curlers", "repairing_puncture", "riding_a_bike", "riding_bumper_cars", "riding_camel", "riding_elephant", "riding_mechanical_bull", "riding_mower", "riding_mule", "riding_or_walking_with_horse", "riding_scooter", "riding_snow_blower", "riding_unicycle", "ripping_paper", "river_tubing", "roasting", "roasting_marshmallows", "roasting_pig", "robot_dancing", "rock_climbing", "rock_scissors_paper", "rodeo", "roller_skating", "rollerblading", "rolling_pastry", "roof_shingle_removal", "rope_pushdown", "running", "running_on_treadmill", "sailing", "salsa_dancing", "sanding_floor", "sausage_making", "sawing_wood", "scrambling_eggs", "scrapbooking", "scrubbing_face", "scuba_diving", "separating_eggs", "setting_table", "sewing", "shaking_hands", "shaking_head", "shaping_bread_dough", "sharpening_knives", "sharpening_pencil", "shaving_head", "shaving_legs", "shearing_sheep", "shining_flashlight", "shining_shoes", "shooting", "shooting_basketball", "shooting_goal_soccer_", "shopping", "shot_put", "shoveling_snow", "shucking_oysters", "shuffling_cards", "shuffling_feet", "side_kick", "sign_language_interpreting", "singing", "sipping_cup", "sitting", "situp", "skateboarding", "ski_jumping", "skiing", "skiing_crosscountry", "skiing_mono", "skiing_slalom", "skipping_rope", "skipping_stone", "skydiving", "slacklining", "slapping", "sled_dog_racing", "sledding", "sleeping", "smashing", "smelling_feet", "smile", "smoking", "smoking_hookah", "smoking_pipe", "snatch_weight_lifting", "sneezing", "snorkeling", "snow_tubing", 
"snowboarding", "snowkiting", "snowmobiling", "soccer", "softball", "somersaulting", "sparring", "spelunking", "spinning_poi", "sports_training", "spray_painting", "spread_mulch", "springboard_diving", "sprint", "square_dancing", "squat", "standing", "standing_on_hands", "staring", "steer_roping", "sticking_tongue_out", "stitch", "stomping_grapes", "stone_carving", "strength_training", "stretching_arm", "stretching_leg", "sucking_lolly", "surf_fishing", "surfing_crowd", "surfing_water", "sweeping_floor", "swimming", "swimming_backstroke", "swimming_breast_stroke", "swimming_butterfly_stroke", "swimming_front_crawl", "swing_dancing", "swinging_baseball_bat", "swinging_on_something", "sword_fighting", "sword_swallowing", "table_soccer", "tackling", "tagging_graffiti", "tai_chi", "talking_on_cell_phone", "tango_dancing", "tap_dancing", "tapping_guitar", "tapping_pen", "tasting_beer", "tasting_food", "tasting_wine", "testifying", "texting", "threading_needle", "throwing_axe", "throwing_ball_not_baseball_or_american_football_", "throwing_discus", "throwing_knife", "throwing_snowballs", "throwing_tantrum", "throwing_water_balloon", "tickling", "tie_dying", "tightrope_walking", "tiptoeing", "tobogganing", "tossing_coin", "track_and_field", "trail_riding", "training_dog", "trapezing", "trimming_or_shaving_beard", "trimming_shrubs", "trimming_trees", "triple_jump", "twiddling_fingers", "tying_bow_tie", "tying_knot_not_on_a_tie_", "tying_necktie", "tying_shoe_laces", "unboxing", "underwater_diving", "unloading_truck", "using_a_microscope", "using_a_paint_roller", "using_a_power_drill", "using_a_sledge_hammer", "using_a_wrench", "using_atm", "using_bagging_machine", "using_circular_saw", "using_inhaler", "using_puppets", "using_remote_controller_not_gaming_", "using_segway", "using_the_monkey_bar", "using_the_pommel_horse", "vacuuming_floor", "visiting_the_zoo", "wading_through_mud", "wading_through_water", "waiting_in_line", "waking_up", "walking", "walking_the_dog", 
"walking_through_snow", "washing_dishes", "washing_feet", "washing_hair", "washing_hands", "waste", "watching_tv", "water_skiing", "water_sliding", "watering_plants", "waving_hand", "waxing_back", "waxing_chest", "waxing_eyebrows", "waxing_legs", "weaving", "weaving_basket", "weaving_fabric", "welding", "whistling", "wicker_weaving", "windsurfing", "winking", "wood_burning_art_", "worship", "wrapping_present", "wrestling", "writing", "yarn_spinning", "yawning", "yoga", "zumba"], "attribute": ["afro", "aggression", "al_dente", "angora", "art_paper", "asphalt", "azure", "bangs", "barechestedness", "beauty", "beige", "black", "black_and_white", "black_hair", "blond", "blue", "bmw", "boiling", "brass", "bricks_and_mortar", "brown", "brown_hair", "caffeine", "calm", "camouflage", "caramel_color", "cardboard", "ceramic", "citric_acid", "classic", "clay", "cleft", "cobalt_blue", "coca_cola", "complexion", "concrete", "cool", "dairy", "darkness", "daytime", "deciduous", "denim", "drama", "elder", "electric_blue", "emerald", "evergreen", "explosive_material", "floating", "fluid", "flyweight", "forward", "freezing", "fun", "glitter", "gold", "granite", "green", "happy", "human_hair_color", "hunky", "inflatable", "iron", "laminate", "layered_hair", "leather", "leisure", "lilac", "long_hair", "magenta", "maroon", "metal", "metropolis", "military", "moist", "monochrome", "multimedia", "neon", "orange", "origami_paper", "paper", "patchwork", "peach", "pigtail", "pink", "plane", "plastic", "platinum_blond", "plush", "plywood", "polka_dot", "pompadour", "purple", "rapid", "red", "red_hair", "reflection", "satin", "shade", "silk", "silver", "sweetness", "symmetry", "synthetic_rubber", "teal", "transparency_and_translucency", "turquoise", "velvet", "violet", "white", "wood", "wool", "woolen", "woven_fabric", "wrinkle", "yellow", "youth"], "concept": ["aerial_photography", "agriculture", "air_force", "air_sports", "american_food", "ancient_history", "angle", "animal_migration", 
"animal_source_foods", "animal_sports", "arch", "architecture", "army", "art", "artistic_gymnastics", "asian_food", "athletics", "audience", "automotive_design", "automotive_exterior", "aviation", "baked_goods", "ball_over_a_net_games", "bat_and_ball_games", "benthos", "blessing", "boardsport", "brand", "business", "cable_management", "cellular_network", "choir", "circle", "circus", "class", "classic_car", "classical_music", "clergy", "clip_art", "close_up", "collaboration", "color_guard", "combat_sport", "comfort", "comfort_food", "commodity", "community", "computer_program", "concert_band", "confectionery", "construction", "contact_sport", "convenience_food", "costume_design", "court", "court_game", "crew", "crowd", "cube", "cuisine", "currency", "cycle_sport", "cylinder", "decor", "design", "dialog_box", "diet_food", "display_advertising", "dog_breed", "dog_sports", "doubles", "dressage", "east_asian_food", "ecosystem", "electrical_network", "electricity", "electronics", "emergency", "emergency_service", "emotion", "endurance_sports", "energy", "engineering", "ensemble", "entertainment", "equestrian_sport", "erg", "european_food", "extreme_sport", "facial_expression", "family", "fashion_design", "fast_food", "fauna", "fictional_character", "field_game", "film", "finger_food", "fixed_link", "floral_design", "floristry", "font", "fried_food", "friendship", "frozen_food", "games", "geological_phenomenon", "geology", "german_food", "golf_club", "graffito", "graphic_design", "graphics", "grilled_food", "hairstyle", "handwriting", "health_care", "heart", "heat", "herd", "history", "human_behavior", "individual_sports", "indoor_games_and_sports", "industry", "infrastructure", "interaction", "interior_design", "inventory", "italian_food", "japanese_cuisine", "japanese_martial_arts", "job", "junk_food", "kite_sports", "land_vehicle", "laser", "laughter", "law_enforcement", "light_commercial_vehicle", "lighting", "line", "line_art", "local_food", "lockstitch", "logo", 
"love", "luxury_vehicle", "luxury_yacht", "major_appliance", "male", "management", "map", "marching_band", "marine_mammal", "martial_arts", "mass_production", "match_play", "meal", "medal_play", "medical", "medicine", "memorial", "mesh", "meteorological_phenomenon", "mid_size_car", "military_officer", "military_organization", "military_rank", "mineral", "mixture", "mode_of_transport", "modern_art", "money", "monochrome_photography", "motorsport", "music", "musical_ensemble", "natural_foods", "nature", "news", "non_sporting_group", "number", "off_road", "official", "orchestra", "organism", "pachyderm", "packaging_and_labeling", "painting", "party_supply", "pattern", "people", "performance", "performing_arts", "physical_fitness", "pint_us", "plaid", "plant_community", "plaster", "police", "pollinator", "pollution", "pop_music", "primate", "public_transport", "public_utility", "pyramid", "racquet_sport", "rapid_transit", "real_estate", "recipe", "rectangle", "religion", "research", "rock", "roller_sport", "romance", "rose_order", "seafood", "security", "selfie", "service", "shadow", "shelving", "shoal", "shooting_sport", "side_dish", "silhouette", "singles", "skin_care", "social_group", "software", "song", "spanish_cuisine", "sphere", "spiral", "spoor", "sport", "spotlight", "spring_break", "square", "star", "stick_and_ball_games", "stick_and_ball_sports", "still_life", "still_life_photography", "stock_photography", "street_art", "street_food", "striking_combat_sports", "stucco", "superfood", "surface_water_sports", "symbol", "tartan", "taste", "team", "team_sport", "technology", "telephony", "television_program", "tool", "tourism", "towed_water_sport", "tradition", "traditional_sport", "traffic", "tread", "triangle", "tribe", "troop", "underwater", "vegetarian_food", "vegetation", "video_game_software", "visual_arts", "war", "waste_containment", "water_ball_sports", "water_sport", "water_transportation", "watercraft", "weapon", "weapon_combat_sports", "website", 
"whole_food", "wildlife", "wind", "windsports", "winter_sport"], "event": ["800_metres", "adventure", "air_travel", "art_exhibition", "auto_show", "autumn", "award_ceremony", "banquet", "bedtime", "breakfast", "broad_jump", "brunch", "carnival", "ceremony", "championship", "christmas", "competition", "concert", "conference", "convention", "conversation", "decathlon", "demonstration", "dinner", "disaster", "evening", "exhibition", "festival", "flight", "freight_transport", "general_aviation", "graduation", "halloween", "heptathlon", "holiday", "lecture", "lunch", "manicure", "marathon", "massage", "meeting", "morning", "multi_sport_event", "news_conference", "night", "parade", "party", "photo_shoot", "picnic", "presentation", "protest", "public_event", "race", "ritual", "road_trip", "rock_concert", "safari", "seminar", "ski_cross", "speech", "spring", "summer", "sunrise_and_sunset", "supper", "tournament", "vacation", "wedding", "wedding_reception", "winter"], "object": ["abdomen", "academic_dress", "accordion", "accordionist", "acoustic_electric_guitar", "acoustic_guitar", "acrylic_paint", "action_figure", "active_undergarment", "adding_machine", "aegean_cat", "aerialist", "african_elephant", "agaric", "agaricaceae", "agaricomycetes", "agaricus", "agricultural_machinery", "agriculturist", "aioli", "air_bubble", "air_gun", "aircraft", "airliner", "alaskan_malamute", "album_cover", "alcoholic_beverage", "ale", "algae", "all_terrain_vehicle", "all_xbox_accessory", "alligator", "alloy_wheel", "alpinist", "alto_horn", "american_alligator", "american_pit_bull_terrier", "amusement_ride", "ananas", "anchor", "angle_grinder", "animal_fat", "ankle", "annual_plant", "antique", "antique_car", "appetizer", "apple", "aqua", "aqualung", "aquanaut", "aquarium", "aquatic_plant", "aquifoliaceae", "arabian_camel", "arcade_game", "archer", "arecales", "arm", "artifact", "artificial_fly", "artificial_turf", "artisan", "artwork", "athlete", "athletic_shoe", "audio_engineer", 
"audio_equipment", "auto_part", "automaton", "automotive_engine_part", "automotive_exhaust", "automotive_lighting", "automotive_mirror", "automotive_tire", "automotive_wheel_system", "automotive_window_part", "ax", "ax_handle", "baby_buggy", "baby_carrier", "baby_products", "baby_toys", "back", "backboard", "backhoe", "backseat", "bag", "bagel", "baggage", "bagpipes", "bait", "baker", "balance_beam", "balcony", "ball", "ballet_dancer", "ballet_skirt", "balloon", "baluster", "bandage", "banderillero", "bandoneon", "banjo", "banner", "barbell", "barber", "baritone_saxophone", "barramundi", "barrel", "barrow", "bartender", "barware", "baseball_bat", "baseball_cap", "baseball_equipment", "baseball_player", "basket", "basketball_player", "bass", "bass_drum", "bass_fiddle", "bass_guitar", "bass_oboe", "bassinet", "bassist", "bassoon", "bathing_cap", "bathroom_accessory", "bathroom_sink", "bathtub", "batter", "bayonne_ham", "bead", "beak", "beam", "bean", "beanie", "beard", "bed", "bed_frame", "bed_sheet", "bedding", "bedrock", "bee", "beef", "beef_tenderloin", "beehive", "beekeeper", "beer", "beer_cocktail", "beer_glass", "belay_device", "bell_peppers_and_chili_peppers", "bench", "berry", "beyaz_peynir", "bib", "bichon", "bicycle", "bicycle_accessory", "bicycle_chain", "bicycle_drivetrain_part", "bicycle_frame", "bicycle_handlebar", "bicycle_helmet", "bicycle_part", "bicycle_saddle", "bicycle_tire", "bicycle_wheel", "bidet", "big_cats", "bikini", "billboard", "bin", "birch", "bird", "birthday_cake", "biscuit", "black_belt", "black_cat", "blackboard", "blacksmith", "blade", "blazer", "blender", "block", "blood", "blossom", "blouse", "blue_collar_worker", "bmx_bike", "boa_constrictor", "board_game", "boas", "boat", "boats_and_boating_equipment_and_supplies", "bobsled", "bocce_ball", "bodybuilder", "bolete", "bonfire", "bongo", "bony_fish", "book", "bookcase", "boot", "bottle", "bottled_water", "boulder", "bouquet", "bow_and_arrow", "bow_tie", "bowed_string_instrument", 
"bowie_knife", "bowl", "bowler", "bowling_ball", "bowling_equipment", "bowling_pin", "box", "boxing_equipment", "boxing_glove", "boy", "bracelet", "brake_disk", "branch", "brass_instrument", "brassiere", "bratwurst", "bread", "bread_dough", "brick", "bricklayer", "brickwork", "bridal_clothing", "bride", "bridle", "briefs", "broccoli", "brochette", "bromeliaceae", "broom", "broth", "brush", "bubble", "bubble_gum", "bucket", "bugle", "bull", "bulldozer", "bullfighter", "bumper", "bumper_car", "bun", "bungee", "buoyancy_compensator", "bus", "businessperson", "butcher", "buttercream", "button", "button_accordion", "cab", "cabin_cruiser", "cabinet", "cabinetry", "cable", "caesar_salad", "cage", "cake", "calf", "camel", "camera", "camera_accessory", "camera_lens", "camera_operator", "camgirl", "campfire", "candle", "cannon", "canoe", "cap", "car", "car_mirror", "car_seat", "car_seat_cover", "car_tire", "car_wheel", "carbonara", "carbonated_soft_drinks", "cardboard_box", "caricaturist", "carnivoran", "carpenter", "carpet", "carriage", "carrot", "cart", "carton", "cartoon", "carving", "cash", "cash_machine", "cat", "catamaran", "cattle_like_mammal", "ceiling", "celesta", "cellist", "cello", "cellular_telephone", "center_console", "central_processing_unit", "centrepiece", "chain", "chain_link_fencing", "chain_saw", "chair", "chalk", "champagne", "champagne_stemware", "charcoal", "charcuterie", "chariot", "chassis", "cheek", "cheerleader", "cheerleading_uniform", "cheese", "cheese_pizza", "cheeseburger", "chef", "cherry", "chess_master", "chessboard", "chessman", "chest", "chest_hair", "chest_of_drawers", "chicken", "chihuahua", "child", "chin", "chip", "chocolate", "chocolate_brownie", "chocolate_cake", "chocolate_chip_cookie", "chocolate_spread", "choreographer", "christmas_decoration", "christmas_lights", "christmas_tree", "chute", "circuit", "circuit_component", "circular_saw", "circus_acrobat", "citrullus", "citrus", "city_car", "clam", 
"clams_oysters_mussels_and_scallops", "clarinet", "clarinet_family", "clavier", "clementine", "climber", "climbing_frame", "climbing_harness", "closet", "clothes_closet", "clothes_dryer", "clothes_hamper", "clothing", "cloud", "clown", "coat", "cobblestone", "cockapoo", "cocktail", "cocktail_dress", "cocktail_garnish", "coconut", "cod", "coffee", "coffee_bean", "coffee_cup", "coffee_table", "coin", "cola", "colander", "cold_weapon", "collage", "collar", "collection", "collie", "color_television", "colt", "colubridae", "column", "comb", "comforter", "commercial_vehicle", "common_pet_parakeet", "communication_device", "commuter", "compact_car", "compact_van", "companion_dog", "composite_material", "compound_microscope", "computer", "computer_accessory", "computer_case", "computer_component", "computer_cooling", "computer_hardware", "computer_keyboard", "concert_grand", "concertina", "condiment", "conifer", "construction_equipment", "construction_worker", "convertible", "cookie", "cookie_sheet", "cookies_and_crackers", "cookware_accessory", "cookware_and_bakeware", "cor_anglais", "coral", "coral_reef_fish", "cornet", "cosmetics", "costume", "couch", "countertop", "coverall", "cow_goat_family", "cowbarn", "cowboy", "cowboy_hat", "craftsman", "crampon", "crane", "cravat", "cream", "cream_cheese", "cricket_bat", "cricketer", "crochet_needle", "crocodile", "crocodilia", "crop", "croquet_mallet", "crossword_puzzle", "cruciferous_vegetables", "crystal", "cuatro", "cucumber", "cucumber_gourd_and_melon_family", "cucumis", "cucurbita", "cumulus", "cup", "cupboard", "curbstone", "curd", "curtain", "customer", "cut_flowers", "cutlery", "cymbal", "dairy_cattle", "dairy_cow", "dairy_product", "dance_dress", "dancer", "dashboard", "data_storage_device", "date_palm", "defenseman", "desk", "desktop_computer", "dessert", "dhow", "diaper", "diatonic_button_accordion", "digital_clock", "dining_table", "dinnerware_set", "dip", "discinaceae", "dish", "dishware", "dishwasher", 
"disk_jockey", "display_case", "display_device", "display_window", "distilled_beverage", "divemaster", "diver", "diving_equipment", "diving_mask", "dobok", "document", "dog", "dog_sled", "doll", "dolphin", "dome", "domestic_rabbit", "donkey", "door", "door_handle", "double_bass", "dough", "drawer", "dress", "dress_shirt", "drill", "drink", "drinker", "drinking_water", "drinkware", "drop", "drum", "drumhead", "drummer", "drumstick", "dry_suit", "dryer", "duck", "ducks_geese_and_swans", "dumbbell", "dump_truck", "duplicator", "dustpan", "ear", "earl_grey_tea", "earrings", "eating_apple", "edger", "edible_mushroom", "egg", "egg_yolk", "electric_guitar", "electric_organ", "electric_piano", "electrical_supply", "electrical_wiring", "electronic_component", "electronic_device", "electronic_keyboard", "electronic_musical_instrument", "electronic_signage", "electronics_accessory", "elephant", "elliptical_trainer", "emblem", "emergency_vehicle", "engine", "engineer", "envelope", "epee", "equestrian", "espresso", "euphonium", "executive_car", "exercise_bike", "exercise_equipment", "exercise_machine", "exhaust_system", "eye", "eye_shadow", "eyebrow", "eyelash", "eyewear", "facade", "face", "facial_hair", "family_car", "fan", "farm_machine", "farmer", "farmworker", "fashion_accessory", "fashion_model", "faucet", "feather", "feather_boa", "feature_phone", "fedora", "fence", "fencing_sword", "fencing_weapon", "fern", "ferry", "fiddle", "field_hockey_ball", "figure_skater", "figurine", "fin", "finger", "finger_paint", "fipple_flute", "fir", "fire", "firearm", "firefighter", "fireplace", "fish", "fish_feeder", "fisherman", "fishing_bait", "fishing_lure", "fishing_rod", "fishing_vessel", "fitness_professional", "flag", "flag_of_the_united_states", "flagstone", "flashlight", "flat_panel_display", "flatbread", "flautist", "flightless_bird", "flooring", "florist", "flour", "flourless_chocolate_cake", "flower", "flower_bouquet", "flowering_plant", "flowerpot", "flush_toilet", "flute", 
"flutist", "fly", "foal", "foil", "folk_dancer", "folk_instrument", "fondant", "food", "food_processor", "foot", "football_equipment_and_supplies", "football_helmet", "football_player", "footwear", "forehead", "fork", "forklift_truck", "formal_wear", "fortepiano", "foundation", "fountain", "fountain_pen", "free_reed_aerophone", "french_fries", "fret", "fried_egg", "fried_rice", "frost", "frozen_dessert", "fruit", "fruit_tree", "frying_pan", "fuel", "full_size_car", "fungus", "fur", "fur_clothing", "furniture", "gadget", "galliformes", "game_controller", "garbage_heap", "garbage_man", "garbage_truck", "garden_roses", "gardener", "garmon", "garnish", "gas_burner", "gas_pump", "gas_ring", "gate", "gauge", "gazebo", "gear", "gearshift", "gemstone", "german_shepherd_dog", "german_spitz", "gift", "gin_and_tonic", "giraffe", "girl", "glass", "glassblower", "glasses", "glider", "glockenspiel", "glove", "glutinous_rice", "go_kart", "goal", "goat", "goat_antelope", "goggles", "golden_retriever", "goldfish", "golf_ball", "golf_equipment", "golfcart", "golfer", "gourd", "gown", "graffiti", "grand_piano", "grape", "grapevine_family", "grass", "gravel", "great_dane", "greek_salad", "green_algae", "green_bean", "greenland_dog", "grenadier", "greyhound", "griddle", "grocer", "groom", "groundcover", "guard_dog", "guard_rail", "guitar", "guitar_accessory", "guitarist", "gymnast", "hair", "hair_accessory", "hair_coloring", "hair_dryer", "hairbrush", "hairdresser", "halter", "hamburger", "hammer", "hand", "hand_calculator", "hand_drum", "hand_glass", "handbag", "handcart", "handlebar", "handrail", "hang_glider", "hard_hat", "hardware", "hardware_accessory", "harmonica", "harp", "harvester", "hat", "hatchback", "hatchet", "havanese", "hay", "head", "head_restraint", "headgear", "headphones", "headpiece", "hearth", "heat_sink", "hedge", "heel", "helmet", "herb", "high_heeled_footwear", "highchair", "hip", "hockey_protective_equipment", "hockey_stick", "home_accessories", 
"home_appliance", "home_door", "home_fencing", "home_game_console_accessory", "honey_bee", "honeycomb", "hood", "hoodie", "horizontal_bar", "horn", "hors_d_oeuvre", "horse", "horse_and_buggy", "horse_harness", "horse_like_mammal", "horse_supplies", "horse_tack", "horse_trainer", "horseman", "hospital_bed", "hot_air_balloon", "hot_pot", "hot_tub", "household_cleaning_supply", "houseplant", "hub_gear", "hubcap", "human", "human_body", "human_leg", "hunting_dog", "hurdle", "hybrid_bicycle", "ice", "ice_cream", "ice_cream_cone", "ice_lolly", "ice_skate", "iceberg", "icing", "illustration", "indian_elephant", "infant", "infant_bed", "infantry", "inflatable_boat", "ingredient", "input_device", "insect", "invertebrate", "io_card", "iris", "ivy", "jack_o_lantern", "jacket", "jasmine_rice", "javelin", "jaw", "jeans", "jersey", "jewellery", "jigsaw_puzzle", "jockey", "joint", "jointer", "journalist", "joystick", "juggler", "juice", "jungle_gym", "kayak", "kettle", "keyboard_instrument", "keyboard_player", "kielbasa", "kilt", "kisser", "kitchen_appliance", "kitchen_knife", "kite", "kitten", "knackwurst", "knee", "knife", "knit_cap", "knitting_needle", "knot", "koi", "konghou", "lab_coat", "label", "labrador_retriever", "lace", "lacrosse_stick", "lacrosse_training_equipment", "ladder", "lamp", "laptop", "lasso", "latch", "lathe", "laundry", "lawn", "lcd_tv", "lead_pencil", "leaf", "leaf_vegetable", "leash", "led_backlit_lcd_display", "leggings", "lemon", "lemonade", "lens", "leotard", "lettuce", "lever", "ligament", "light_bulb", "light_fixture", "light_microscope", "lighter", "lighting_accessory", "lineman", "linens", "lingerie", "lip", "lip_gloss", "lipstick", "liquor_shelf", "litter", "little_black_dress", "livestock", "lobe", "lock", "locker", "locomotive", "loggerhead", "lollipop", "longboard", "loom", "lotion", "loudspeaker", "lovebird", "loveseat", "lumber", "lute", "macaw", "machine", "machine_tool", "magazine", "maillot", "makeup", "mallet", "maltese", "mammal", 
"man", "mandarin_orange", "mandolin", "mane", "maraca", "marcher", "mare", "marimba", "marine_invertebrates", "marines", "mask", "mason_jar", "mast", "mat", "matador", "matsutake", "mattress", "mattress_pad", "mcintosh", "measuring_instrument", "meat", "meat_grinder", "mechanic", "media_player", "medical_assistant", "medical_equipment", "medical_glove", "medicine_ball", "melee_weapon", "mellophone", "melon", "membrane_winged_insect", "mender", "metal_lathe", "metalsmith", "microcontroller", "microphone", "microscope", "microwave_oven", "miler", "military_camouflage", "military_person", "military_uniform", "milk", "miniature_poodle", "minibus", "minivan", "mirror", "mixer", "mixing_bowl", "mixing_console", "mobile_device", "mobile_phone", "model", "monument", "moped", "moss", "motherboard", "motocross_bike", "motor_scooter", "motor_ship", "motor_vehicle", "motorboat", "motorcycle", "motorcycle_accessories", "motorcyclist", "motorized_wheelchair", "mountain_bike", "mountaineer", "moustache", "mouth", "mower", "mud", "mug", "mule", "mural", "muscle", "musher", "mushroom", "musical_instrument", "musical_instrument_accessory", "musical_keyboard", "musician", "musket", "nail", "nail_polish", "neck", "necklace", "necktie", "needle", "neon_lamp", "neon_sign", "net", "newscaster", "newspaper", "nib", "nightwear", "non_alcoholic_beverage", "non_commissioned_officer", "non_skin_percussion_instrument", "noodle", "nose", "numeric_keypad", "oars", "oboist", "ocarina", "off_road_vehicle", "office_equipment", "office_supplies", "oil_paint", "open_wheel_car", "optical_instrument", "orator", "organ", "organ_pipe", "organist", "outdoor_furniture", "outdoor_grill", "outdoor_play_equipment", "outdoor_power_equipment", "outdoor_shoe", "outdoor_structure", "outerwear", "output_device", "overhead_power_line", "ox", "oxygen_mask", "oyster", "oyster_mushroom", "oyster_shell", "pack_animal", "paddle", "padlock", "paintball_equipment", "paintball_gun", "palm_tree", "pan", "panelist", 
"pantyhose", "paper_product", "paper_towel", "parachute", "parakeet", "parallel_bars", "park_bench", "parquet", "parrot", "parsley", "passenger", "passenger_ship", "pasta", "pastry", "patient", "paving", "paw", "pawn", "pearl", "pebble", "pedestrian", "peel", "pen", "pencil", "pencil_sharpener", "pepperoni", "percussion_accessory", "percussion_instrument", "percussionist", "performance_car", "perico", "personal_computer", "personal_digital_assistant", "personal_flotation_device", "personal_protective_equipment", "petal", "pezizales", "photocopier", "physical_therapist", "physician", "pianet", "pianist", "piano", "piano_keyboard", "picador", "picket_fence", "pickup_truck", "picnic_boat", "pig", "pig_like_mammal", "pigeon", "pigeons_and_doves", "pillow", "pilot_boat", "pinata", "pinball_machine", "pine", "pine_family", "pineapple", "pinscher", "pint_glass", "pipe", "pizza", "pizza_cheese", "plant", "plant_stem", "plastic_bag", "plate", "platter", "play_vehicle", "player", "playground_slide", "playpen", "playstation_3_accessory", "playstation_accessory", "pliers", "plimsoll", "plucked_string_instruments", "plumbing", "plumbing_fixture", "pocket", "pointer", "pole", "police_officer", "polo_mallet", "polo_pony", "polo_shirt", "pomeranian", "pommel_horse", "pontoon", "pony", "poodle", "porcelain", "portable_communications_device", "portable_media_player", "portrait", "poster", "potato", "potato_and_tomato_genus", "pothole", "powdered_sugar", "power_drill", "power_mower", "power_shovel", "printer", "produce", "professional_golfer", "propeller", "protective_equipment_in_gridiron_football", "protective_gear_in_sports", "pug", "pumpkin", "pungsan_dog", "puppy", "putter", "puzzle", "queen", "quill", "rabbit", "race_car", "racer", "racing_bicycle", "racket", "radial", "random_orbital_sander", "ranged_weapon", "rear_view_mirror", "recycling_bin", "red_carpet", "red_meat", "red_wine", "redhead", "reed_instrument", "refrigerator", "rein", "remote_control", "reptile", 
"researcher", "retaining_wall", "retriever", "ribbon", "rice", "rifle", "rim", "ring", "road_bicycle", "roast_beef", "robot", "rock_climbing_equipment", "rock_star", "rodent", "roller_blades", "roller_skates", "rolling_pin", "roof", "root", "root_vegetable", "rope", "rose", "rose_family", "rotisserie", "royal_icing", "rubber_boot", "rubble", "runner", "running_shoe", "saddle", "safe", "safety_belt", "safety_bicycle", "safety_glove", "sail", "sailboat", "sailing_ship", "salad", "salmon", "samoyed", "sand", "sand_wedge", "sandal", "sandbox", "sandwich", "sapsali", "sari", "sarong", "sash_window", "sashimi", "saucer", "sauces", "sausage", "saw", "saxhorn", "saxophone", "saxophonist", "scaffolding", "scale_model", "scaled_reptile", "scanner", "scarf", "schipperke", "schnoodle", "schooner", "scientific_instrument", "scissors", "scooter", "scoreboard", "scow", "scrap", "screen", "scuba_diver", "sculptor", "sculpture", "sea_ice", "sea_kayak", "sea_turtle", "seabird", "seaplane", "seat_belt", "seaweed", "sedan", "seed", "segway", "senior_citizen", "serger", "serpent", "serveware", "sewing_machine", "sewing_machine_needle", "shaving_cream", "shed", "sheep", "shelf", "shih_tzu", "ship", "shipwreck", "shirt", "shoe", "shopkeeper", "shopping_basket", "shopping_cart", "shorts", "shoulder", "shovel", "shower_curtain", "shrimp", "shrub", "siberian_husky", "sicilian_pizza", "sideboard", "siding", "sign", "singer", "singlet", "sink", "skateboard", "skateboarder", "skateboarding_equipment_and_supplies", "sketch", "skewer", "ski", "ski_binding", "ski_equipment", "ski_pole", "skidder", "skiff", "skin", "skin_head_percussion_instrument", "skirt", "slate_roof", "sled", "sled_dog", "sleeper", "sleeve", "sloop", "slot", "slot_machine", "small_appliance", "smartphone", "smoke", "snack", "snake", "snare_drum", "sneakers", "snorkel", "snout", "snow_thrower", "snowboard", "snowmobile", "snowplow", "snowshoe", "snowsuit", "soccer_ball", "soccer_player", "sock", "soft_drink", "soil", "soup", 
"space_bar", "spaghetti", "spaniel", "spatula", "speaker", "speedometer", "speleothem", "spice", "spin_dryer", "spinach", "spinach_salad", "spindle", "spinet", "spinning_wheel", "spitz", "spoke", "spokesperson", "spoon", "sport_kite", "sport_utility_vehicle", "sports_car", "sports_equipment", "sports_uniform", "sportswear", "spring_greens", "sprinkler", "spruce", "spume", "square_dancer", "squash", "stairs", "stalagmite", "stall", "stallion", "standard_poodle", "statue", "steak", "steam_iron", "steamed_rice", "steel", "steel_drum", "steering_part", "steering_wheel", "stemware", "stew", "stick", "stock_car", "stock_dove", "stocking", "stomach", "stone_wall", "stony_coral", "storage_basket", "stout", "stove_and_oven", "strainer", "straw", "streamer_fly", "street_light", "string_instrument", "string_instrument_accessory", "stubble", "student", "stuffed_toy", "stuffing", "stunt_performer", "subcompact_car", "subwoofer", "sugar_cake", "sugar_paste", "suit", "sun", "sun_hat", "sunbather", "sunglasses", "sunlight", "supercar", "superhero", "surfboard", "surfing_equipment_and_supplies", "sushi", "swab", "swan", "sweater", "sweet_grass", "swimmer", "swimsuit_bottom", "swimwear", "swing", "switch", "synthesizer", "t_shirt", "tabby_cat", "table", "table_knife", "table_tennis_racket", "tablecloth", "tabletop_game", "tableware", "tachometer", "taglierini", "tail", "tall_ship", "tank", "tarpaulin", "tattoo", "tea", "teacher", "teapot", "teddy_bear", "telephone", "television_presenter", "television_reporter", "television_set", "tennis_equipment_and_supplies", "tennis_player", "tennis_pro", "tennis_racket", "tenor_saxophonist", "tent", "terrestrial_animal", "terrestrial_plant", "terrier", "text", "textile", "theater_curtain", "therapist", "thigh", "thorns_spines_and_prickles", "thread", "thumb", "tights", "tile", "tiple", "tire", "toast", "toddler", "toe", "toilet", "toilet_tissue", "tom_tom_drum", "tomahawk", "tomato", "tongue", "tooth", "toothbrush", "top", "toppings", "torch", 
"torso", "torte", "tower", "toy", "toy_box", "toy_poodle", "track_spikes", "tractor", "traffic_cop", "traffic_light", "trail_bike", "trailer", "trailer_truck", "train", "trampoline", "trapeze", "travel_trailer", "tree", "tricycle", "trigger", "trombone", "trousers", "trowel", "truck", "trumpet", "trumpeter", "tub", "tudung", "tusk", "tuxedo", "twig", "uke", "umbrella", "undergarment", "underpants", "uneven_parallel_bars", "unicycle", "unicyclist", "uniform", "urinal", "vacuum_cleaner", "van", "vascular_plant", "vase", "vaulter", "vegetable", "vehicle", "vehicle_brake", "vehicle_door", "vehicle_registration_plate", "venison", "vertebrate", "vibraphone", "video_game_console", "vigil_light", "vintage_car", "vintage_clothing", "violin", "violin_family", "violinist", "violist", "vitis", "vizsla", "volleyball_net", "volleyball_player", "wagon", "waist", "waiter", "walk_behind_mower", "walker", "walking_shoe", "wall", "wardrobe", "washbasin", "washing_machine", "waste_container", "watch", "water", "water_bird", "water_feature", "water_polo_cap", "water_ski", "watercolor_paint", "waterfowl", "watering_can", "watermelon", "wave", "wedding_ceremony_supply", "wedding_dress", "wedding_ring", "weightlifter", "weights", "welder", "west_highland_white_terrier", "wetsuit", "whaler", "whales_dolphins_and_porpoises", "wheat_beer", "wheel", "wheelchair", "whipped_cream", "whippet", "whisk", "whiskers", "whisky", "whistle", "white_coat", "white_collar_worker", "white_rice", "wicker_basket", "wicket", "wig", "wildflower", "wildlife_biologist", "wind_instrument", "wind_wave", "window", "window_blind", "window_covering", "window_screen", "window_treatment", "windshield", "windshield_wiper", "wine", "wine_glass", "wing", "winter_squash", "wiper", "wire", "wire_fencing", "wok", "woman", "wood_burning_stove", "wood_stain", "woodwind_instrument", "woody_plant", "workman", "wrench", "wrestler", "wrestling_mat", "wrestling_singlet", "wrist", "xylophone", "yacht", "yakitori", "yolk"], "scene": 
["aeolian_landform", "aisle", "alley", "amusement_park", "animal_shelter", "apartment", "apiary", "archaeological_site", "arena", "arroyo", "attic", "auditorium", "automobile_repair_shop", "backyard", "badlands", "bakery", "ballpark", "ballroom", "bank", "bar", "barbershop", "barn", "baseball_field", "baseball_positions", "basement", "basketball_court", "bathroom", "batting_cage", "bay", "bayou", "bazaar", "beach", "beauty_salon", "bedroom", "boardwalk", "body_of_water", "boutique", "bowling_alley", "boxing_ring", "bridge", "building", "bullring", "butcher_shop", "canyon", "cape", "carport", "casino", "cave", "channel", "chapel", "cityscape", "cliff", "clinic", "coast", "coastal_and_oceanic_landforms", "cockpit", "cocktail_lounge", "concert_hall", "condominium", "conference_hall", "coral_reef", "courtyard", "creek", "day_nursery", "deck", "desert", "dining_room", "dock", "downtown", "dune", "ecoregion", "escarpment", "estate", "factory", "fair", "farm", "fault", "field", "field_lacrosse", "fire_department", "fish_pond", "floor", "fluvial_landforms_of_streams", "football_stadium", "forest", "formation", "foundry", "function_hall", "garage", "garden", "garden_buildings", "glacial_lake", "golf_course", "grassland", "grocery_store", "grove", "gym", "hall", "harbor", "haze", "headland", "highland", "hill", "historic_site", "home", "horizon", "hospital", "hot_spring", "hotel", "hotel_room", "house", "hut", "ice_hockey_position", "ice_hockey_rink", "ice_rink", "inlet", "intersection", "kindergarten", "kitchen", "laboratory", "lake", "land_lot", "landmark", "landscape", "lane", "lecture_room", "leisure_centre", "littoral", "living_room", "log_cabin", "marina", "market", "marsh", "massif", "meadow", "meander", "metropolitan_area", "mountain", "mountain_pass", "mountain_range", "mountainous_landforms", "music_venue", "musical_theatre", "national_park", "natural_resources", "nature_reserve", "neighbourhood", "nightclub", "office", "opera", "outcrop", "paddy_field", "palace", 
"panorama", "park", "parking", "pasture", "path", "patio", "pavilion", "pedestrian_crossing", "performing_arts_center", "piste", "place_of_worship", "plain", "plateau", "playground", "plaza", "pond", "port", "property", "public_space", "race_track", "ranch", "reef", "religious_institute", "reservoir", "residential_area", "resort", "restaurant", "restroom", "retail", "ridge", "riparian_zone", "river", "riverbed", "road", "road_highway", "room", "rural_area", "sandbank", "sandbar", "school", "sea", "seashore", "seaside", "shack", "shooting_range", "shopping_mall", "shore", "sidewalk", "ski_slope", "sky", "skyline", "skyscraper", "snow_covered_landscape", "sport_venue", "stable", "stadium", "stage", "strand", "stream", "stream_bed", "street", "suburb", "summit", "supermarket", "swamp", "swimming_pool", "tavern", "television_room", "tennis_camp", "tennis_court", "terrain", "theatre", "toolroom", "tourist_attraction", "tower_block", "town", "town_square", "track", "tropical_beach", "tropics", "tunnel", "urban_area", "urban_design", "valley", "village", "walkway", "warehouse", "watercourse", "waterfall", "waterway", "wetland", "wildlife_region", "workshop", "yard", "zoo"]} diff --git a/tools/data/hvu/parse_tag_list.py b/tools/data/hvu/parse_tag_list.py new file mode 100644 index 0000000000000000000000000000000000000000..af5c8667fe5433470afc96fa91a9ad0c0586b4f4 --- /dev/null +++ b/tools/data/hvu/parse_tag_list.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmengine + +tag_list = '../../../data/hvu/annotations/hvu_categories.csv' + +lines = open(tag_list).readlines() +lines = [x.strip().split(',') for x in lines[1:]] +tag_categories = {} +for line in lines: + tag, category = line + tag_categories.setdefault(category, []).append(tag) + +for k in tag_categories: + tag_categories[k].sort() + +mmengine.dump(tag_categories, 'hvu_tags.json') diff --git a/tools/data/jester/README.md b/tools/data/jester/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8a05dc72e34e7c5e59536b5cc022ee0efc199a9 --- /dev/null +++ b/tools/data/jester/README.md @@ -0,0 +1,143 @@ +# Preparing Jester + +## Introduction + + + +```BibTeX +@InProceedings{Materzynska_2019_ICCV, + author = {Materzynska, Joanna and Berger, Guillaume and Bax, Ingo and Memisevic, Roland}, + title = {The Jester Dataset: A Large-Scale Video Dataset of Human Gestures}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + month = {Oct}, + year = {2019} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://developer.qualcomm.com/software/ai-datasets/jester). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/jester/`. + +## Step 1. Prepare Annotations + +First of all, you have to sign in and download annotations to `$MMACTION2/data/jester/annotations` on the official [website](https://developer.qualcomm.com/software/ai-datasets/jester). + +## Step 2. Prepare RGB Frames + +Since the [jester website](https://developer.qualcomm.com/software/ai-datasets/jester) doesn't provide the original video data and only extracted RGB frames are available, you have to directly download RGB frames from [jester website](https://developer.qualcomm.com/software/ai-datasets/jester). 
+ +You can download all RGB frame parts on [jester website](https://developer.qualcomm.com/software/ai-datasets/jester) to `$MMACTION2/data/jester/` and use the following command to extract. + +```shell +cd $MMACTION2/data/jester/ +cat 20bn-jester-v1-?? | tar zx +cd $MMACTION2/tools/data/jester/ +``` + +For users who only want to use RGB frames, you can skip to step 5 to generate file lists in the format of rawframes. Since the prefix of official JPGs is "%05d.jpg" (e.g., "00001.jpg"), +we add `"filename_tmpl='{:05}.jpg'"` to the dict of `data.train`, `data.val` and `data.test` in the config files related with jester like this: + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## Step 3. Extract Flow + +This part is **optional** if you only want to use RGB frames. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/jester_extracted/ +ln -s /mnt/SSD/jester_extracted/ ../../../data/jester/rawframes +``` + +Then, you can run the following script to extract optical flow based on RGB frames. + +```shell +cd $MMACTION2/tools/data/jester/ +bash extract_flow.sh +``` + +## Step 4. Encode Videos + +This part is **optional** if you only want to use RGB frames. 
+
+You can run the following script to encode videos.
+
+```shell
+cd $MMACTION2/tools/data/jester/
+bash encode_videos.sh
+```
+
+## Step 5. Generate File List
+
+You can run the following script to generate file list in the format of rawframes and videos.
+
+```shell
+cd $MMACTION2/tools/data/jester/
+bash generate_{rawframes, videos}_filelist.sh
+```
+
+## Step 6. Check Directory Structure
+
+After the whole data process for Jester preparation,
+you will get the rawframes (RGB + Flow), and annotation files for Jester.
+
+In the context of the whole project (for Jester only), the folder structure will look like:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── jester
+│ │ ├── jester_{train,val}_list_rawframes.txt
+│ │ ├── jester_{train,val}_list_videos.txt
+│ │ ├── annotations
+│ | ├── videos
+│ | | ├── 1.mp4
+│ | | ├── 2.mp4
+│ | | ├──...
+│ | ├── rawframes
+│ | | ├── 1
+│ | | | ├── 00001.jpg
+│ | | | ├── 00002.jpg
+│ | | | ├── ...
+│ | | | ├── flow_x_00001.jpg
+│ | | | ├── flow_x_00002.jpg
+│ | | | ├── ...
+│ | | | ├── flow_y_00001.jpg
+│ | | | ├── flow_y_00002.jpg
+│ | | | ├── ...
+│ | | ├── 2
+│ | | ├── ...
+
+```
+
+For training and evaluating on Jester, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/jester/README_zh-CN.md b/tools/data/jester/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..858e6abe204a4163cca42514021d63f973fbf97c --- /dev/null +++ b/tools/data/jester/README_zh-CN.md @@ -0,0 +1,143 @@ +# 准备 Jester + +## 简介 + + + +```BibTeX +@InProceedings{Materzynska_2019_ICCV, + author = {Materzynska, Joanna and Berger, Guillaume and Bax, Ingo and Memisevic, Roland}, + title = {The Jester Dataset: A Large-Scale Video Dataset of Human Gestures}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops}, + month = {Oct}, + year = {2019} +} +``` + +用户可以参照数据集 [官网](https://developer.qualcomm.com/software/ai-datasets/jester),获取数据集相关的基本信息。 +在准备数据集前,请确保命令行当前路径为 `$MMACTION2/tools/data/jester/`。 + +## 步骤 1. 下载标注文件 + +首先,用户需要在 [官网](https://developer.qualcomm.com/software/ai-datasets/jester) 完成注册,才能下载标注文件。下载好的标注文件需要放在 `$MMACTION2/data/jester/annotations` 文件夹下。 + +## 步骤 2. 准备 RGB 帧 + +[jester 官网](https://developer.qualcomm.com/software/ai-datasets/jester) 并未提供原始视频文件,只提供了对原视频文件进行抽取得到的 RGB 帧,用户可在 [jester 官网](https://developer.qualcomm.com/software/ai-datasets/jester) 直接下载。 + +将下载好的压缩文件放在 `$MMACTION2/data/jester/` 文件夹下,并使用以下脚本进行解压。 + +```shell +cd $MMACTION2/data/jester/ +cat 20bn-jester-v1-?? 
| tar zx +cd $MMACTION2/tools/data/jester/ +``` + +如果用户只想使用 RGB 帧,则可以跳过中间步骤至步骤 5 以直接生成视频帧的文件列表。 +由于官网的 JPG 文件名形如 "%05d.jpg" (比如,"00001.jpg"),需要在配置文件的 `data.train`, `data.val` 和 `data.test` 处添加 `"filename_tmpl='{:05}.jpg'"` 代码,以修改文件名模板。 + +```python +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## 步骤 3. 抽取光流 + +如果用户只想使用 RGB 帧训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/jester_extracted/ +ln -s /mnt/SSD/jester_extracted/ ../../../data/jester/rawframes +``` + +如果想抽取光流,则可以运行以下脚本从 RGB 帧中抽取出光流。 + +```shell +cd $MMACTION2/tools/data/jester/ +bash extract_flow.sh +``` + +## 步骤 4: 编码视频 + +如果用户只想使用 RGB 帧训练,则该部分是 **可选项**。 + +用户可以运行以下命令进行视频编码。 + +```shell +cd $MMACTION2/tools/data/jester/ +bash encode_videos.sh +``` + +## 步骤 5. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +cd $MMACTION2/tools/data/jester/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## 步骤 6. 检查文件夹结构 + +在完成所有 Jester 数据集准备流程后, +用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Jester 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── jester +│ │ ├── jester_{train,val}_list_rawframes.txt +│ │ ├── jester_{train,val}_list_videos.txt +│ │ ├── annotations +│ | ├── videos +│ | | ├── 1.mp4 +│ | | ├── 2.mp4 +│ | | ├──... 
+│ | ├── rawframes +│ | | ├── 1 +│ | | | ├── 00001.jpg +│ | | | ├── 00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2 +│ | | ├── ... + +``` + +关于对 jester 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/jester/encode_videos.sh b/tools/data/jester/encode_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..b746a3d05792381e53eb07f3dfd6b2a3314716b2 --- /dev/null +++ b/tools/data/jester/encode_videos.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_videos.py ../../data/jester/rawframes/ ../../data/jester/videos/ --fps 12 --level 1 --start-idx 1 --filename-tmpl '%05d' +echo "Encode videos" + +cd jester/ diff --git a/tools/data/jester/extract_flow.sh b/tools/data/jester/extract_flow.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e81a84dfed9d56657c974b137aa283957ce0caa --- /dev/null +++ b/tools/data/jester/extract_flow.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/jester/rawframes/ ../../data/jester/rawframes/ --task flow --level 1 --flow-type tvl1 --input-frames +echo "Flow (tv-l1) Generated" +cd jester/ diff --git a/tools/data/jester/generate_rawframes_filelist.sh b/tools/data/jester/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..f4b8207205a54deafc07e0097e74e32b8eeecc3d --- /dev/null +++ b/tools/data/jester/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. 
python tools/data/build_file_list.py jester data/jester/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/jester/ diff --git a/tools/data/jester/generate_videos_filelist.sh b/tools/data/jester/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..71f145222a267c82cadfdd5b71eb4f6fdd3992f9 --- /dev/null +++ b/tools/data/jester/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py jester data/jester/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." + +cd tools/data/jester/ diff --git a/tools/data/jester/label_map.txt b/tools/data/jester/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3b080fcae5667b5964933ee424cd2fd7f0d1f27 --- /dev/null +++ b/tools/data/jester/label_map.txt @@ -0,0 +1,27 @@ +Swiping Left +Swiping Right +Swiping Down +Swiping Up +Pushing Hand Away +Pulling Hand In +Sliding Two Fingers Left +Sliding Two Fingers Right +Sliding Two Fingers Down +Sliding Two Fingers Up +Pushing Two Fingers Away +Pulling Two Fingers In +Rolling Hand Forward +Rolling Hand Backward +Turning Hand Clockwise +Turning Hand Counterclockwise +Zooming In With Full Hand +Zooming Out With Full Hand +Zooming In With Two Fingers +Zooming Out With Two Fingers +Thumb Up +Thumb Down +Shaking Hand +Stop Sign +Drumming Fingers +No gesture +Doing other things diff --git a/tools/data/jhmdb/README.md b/tools/data/jhmdb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4450087c7a1177ec1311602f883a0fe9e1a8ad4f --- /dev/null +++ b/tools/data/jhmdb/README.md @@ -0,0 +1,101 @@ +# Preparing JHMDB + +## Introduction + + 
+ +```BibTeX +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +For basic dataset information, you can refer to the dataset [website](http://jhmdb.is.tue.mpg.de/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/jhmdb/`. + +## Download and Extract + +You can download the RGB frames, optical flow and ground truth annotations from [google drive](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct). +The data are provided from [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md), which is adapted from [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector). + +After downloading the `JHMDB.tar.gz` file and put it in `$MMACTION2/tools/data/jhmdb/`, you can run the following command to extract. + +```shell +tar -zxvf JHMDB.tar.gz +``` + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/JHMDB/ +ln -s /mnt/SSD/JHMDB/ ../../../data/jhmdb +``` + +## Check Directory Structure + +After extracting, you will get the `FlowBrox04` directory, `Frames` directory and `JHMDB-GT.pkl` for JHMDB. + +In the context of the whole project (for JHMDB only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── jhmdb +│ | ├── FlowBrox04 +│ | | ├── brush_hair +│ | | | ├── April_09_brush_hair_u_nm_np1_ba_goo_0 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00039.jpg +│ | | | | ├── 00040.jpg +│ | | | ├── ... 
+│ | | | ├── Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2
+│ | | ├── ...
+│ | | ├── wave
+│ | | | ├── 21_wave_u_nm_np1_fr_goo_5
+│ | | | ├── ...
+│ | | | ├── Wie_man_winkt!!_wave_u_cm_np1_fr_med_0
+│ | ├── Frames
+│ | | ├── brush_hair
+│ | | | ├── April_09_brush_hair_u_nm_np1_ba_goo_0
+│ | | | | ├── 00001.png
+│ | | | | ├── 00002.png
+│ | | | | ├── ...
+│ | | | | ├── 00039.png
+│ | | | | ├── 00040.png
+│ | | | ├── ...
+│ | | | ├── Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2
+│ | | ├── ...
+│ | | ├── wave
+│ | | | ├── 21_wave_u_nm_np1_fr_goo_5
+│ | | | ├── ...
+│ | | | ├── Wie_man_winkt!!_wave_u_cm_np1_fr_med_0
+│ | ├── JHMDB-GT.pkl
+
+```
+
+:::{note}
+The `JHMDB-GT.pkl` exists as a cache, it contains 6 items as follows:
+
+1. `labels` (list): List of the 21 labels.
+2. `gttubes` (dict): Dictionary that contains the ground truth tubes for each video.
+   A **gttube** is a dictionary that associates each label index with a list of tubes.
+   A **tube** is a numpy array with `nframes` rows and 5 columns, each col is in format like `<frame index> <x1> <y1> <x2> <y2>`.
+3. `nframes` (dict): Dictionary that contains the number of frames for each video, like `'walk/Panic_in_the_Streets_walk_u_cm_np1_ba_med_5': 16`.
+4. `train_videos` (list): A list with `nsplits=1` elements, each one containing the list of training videos.
+5. `test_videos` (list): A list with `nsplits=1` elements, each one containing the list of testing videos.
+6. `resolution` (dict): Dictionary that outputs a tuple (h,w) of the resolution for each video, like `'pour/Bartender_School_Students_Practice_pour_u_cm_np1_fr_med_1': (240, 320)`.
+ +::: diff --git a/tools/data/jhmdb/README_zh-CN.md b/tools/data/jhmdb/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..9ba6bffea15349b6ff6a39585c05b2dc7342abe3 --- /dev/null +++ b/tools/data/jhmdb/README_zh-CN.md @@ -0,0 +1,98 @@ +# 准备 JHMDB + +## 简介 + + + +```BibTeX +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +用户可参考该数据集的 [官网](http://jhmdb.is.tue.mpg.de/),以获取数据集相关的基本信息。 +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/jhmdb/`。 + +## 下载和解压 + +用户可以从 [这里](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct) 下载 RGB 帧,光流和真实标签文件。 +该数据由 [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md) 代码库提供,参考自 [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector)。 + +用户在下载 `JHMDB.tar.gz` 文件后,需将其放置在 `$MMACTION2/tools/data/jhmdb/` 目录下,并使用以下指令进行解压: + +```shell +tar -zxvf JHMDB.tar.gz +``` + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/JHMDB/ +ln -s /mnt/SSD/JHMDB/ ../../../data/jhmdb +``` + +## 检查文件夹结构 + +完成解压后,用户将得到 `FlowBrox04` 文件夹,`Frames` 文件夹和 `JHMDB-GT.pkl` 文件。 + +在整个 MMAction2 文件夹下,JHMDB 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── jhmdb +│ | ├── FlowBrox04 +│ | | ├── brush_hair +│ | | | ├── April_09_brush_hair_u_nm_np1_ba_goo_0 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00039.jpg +│ | | | | ├── 00040.jpg +│ | | | ├── ... +│ | | | ├── Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +│ | | ├── ... +│ | | ├── wave +│ | | | ├── 21_wave_u_nm_np1_fr_goo_5 +│ | | | ├── ... 
+│ | | | ├── Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +│ | ├── Frames +│ | | ├── brush_hair +│ | | | ├── April_09_brush_hair_u_nm_np1_ba_goo_0 +│ | | | | ├── 00001.png +│ | | | | ├── 00002.png +│ | | | | ├── ... +│ | | | | ├── 00039.png +│ | | | | ├── 00040.png +│ | | | ├── ... +│ | | | ├── Trannydude___Brushing_SyntheticHair___OhNOES!__those_fukin_knots!_brush_hair_u_nm_np1_fr_goo_2 +│ | | ├── ... +│ | | ├── wave +│ | | | ├── 21_wave_u_nm_np1_fr_goo_5 +│ | | | ├── ... +│ | | | ├── Wie_man_winkt!!_wave_u_cm_np1_fr_med_0 +│ | ├── JHMDB-GT.pkl + +``` + +**注意**:`JHMDB-GT.pkl` 作为一个缓存文件,它包含 6 个项目: + +1. `labels` (list):21 个行为类别名称组成的列表 +2. `gttubes` (dict):每个视频对应的基准 tubes 组成的字典 + **gttube** 是由标签索引和 tube 列表组成的字典 + **tube** 是一个 `nframes` 行和 5 列的 numpy array,每一列的形式如 ` ` +3. `nframes` (dict):用以表示每个视频对应的帧数,如 `'walk/Panic_in_the_Streets_walk_u_cm_np1_ba_med_5': 16` +4. `train_videos` (list):包含 `nsplits=1` 的元素,每一项都包含了训练视频的列表 +5. `test_videos` (list):包含 `nsplits=1` 的元素,每一项都包含了测试视频的列表 +6. `resolution` (dict):每个视频对应的分辨率(形如 (h,w)),如 `'pour/Bartender_School_Students_Practice_pour_u_cm_np1_fr_med_1': (240, 320)` diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7c5bb8bb6166ff42b48c99f204a72b14fd0a29d8 --- /dev/null +++ b/tools/data/kinetics/README.md @@ -0,0 +1,184 @@ +# Preparing Kinetics-\[400/600/700\] + +## Introduction + + + +```BibTeX +@inproceedings{inproceedings, + author = {Carreira, J. and Zisserman, Andrew}, + year = {2017}, + month = {07}, + pages = {4724-4733}, + title = {Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset}, + doi = {10.1109/CVPR.2017.502} +} +``` + +For basic dataset information, please refer to the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). + +:::{note} +Because of the expirations of some YouTube links, the sizes of kinetics dataset copies may be different. 
Here are the sizes of our kinetics dataset copies that were used to train all checkpoints.
+
+| Dataset | training videos | validation videos |
+| :---------: | :-------------: | :---------------: |
+| Kinetics400 | 240436 | 19796 |
+| Kinetics600 | 383393 | 27910 |
+| Kinetics700 | 542357 | 34824 |
+
+:::
+
+`````{tabs}
+
+````{group-tab} Download by MIM
+:::{note}
+All experiments on Kinetics in MMAction2 are based on this version, so we recommend users to try this version.
+:::
+
+MIM supports downloading from OpenDataLab and preprocessing the Kinetics-400/600/700 dataset with one command line.
+
+```Bash
+# install OpenXlab CLI tools
+pip install -U openxlab
+# log in OpenXLab
+openxlab login
+# download and preprocess Kinetics-400 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics400
+# download and preprocess Kinetics-600 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics600
+# download and preprocess Kinetics-700 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics700
+
+```
+
+````
+
+````{group-tab} Download from Official Source
+
+## Step 1. Prepare Annotations
+
+The scripts can be used for preparing kinetics400, kinetics600, kinetics700. To prepare different versions of kinetics, you need to replace `${DATASET}` in the following examples with the specific dataset name. The choices of dataset names are `kinetics400`, `kinetics600` and `kinetics700`.
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/${DATASET}/`.
+
+First of all, you can run the following script to prepare annotations by downloading from the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/).
+
+```shell
+bash download_annotations.sh ${DATASET}
+```
+
+Since some video URLs are invalid, the number of video items in the current official annotations is less than in the original official ones.
+So we provide an alternative way to download the older one as a reference.
+Among these, the annotation files of Kinetics400 and Kinetics600 are from the [official crawler](https://github.com/activitynet/ActivityNet/tree/199c9358907928a47cdfc81de4db788fddc2f91d/Crawler/Kinetics/data),
+and the annotation files of Kinetics700 are from the [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/), downloaded on 05/02/2021.
+
+```shell
+bash download_backup_annotations.sh ${DATASET}
+```
+
+## Step 2. Prepare Videos
+
+You can run the following script to prepare videos.
+The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time.
+
+```shell
+bash download_videos.sh ${DATASET}
+```
+
+**Important**: If you have already downloaded the video dataset using the download script above,
+you must replace all whitespaces in the class names for ease of processing by running
+
+```shell
+bash rename_classnames.sh ${DATASET}
+```
+
+For better decoding speed, you can resize the original videos into a smaller-sized, densely-encoded version by:
+
+```bash
+python ../resize_videos.py ../../../data/${DATASET}/videos_train/ ../../../data/${DATASET}/videos_train_256p_dense_cache --dense --level 2
+```
+
+You can also download from [Academic Torrents](https://academictorrents.com/) ([kinetics400](https://academictorrents.com/details/184d11318372f70018cf9a72ef867e2fb9ce1d26) & [kinetics700](https://academictorrents.com/details/49f203189fb69ae96fb40a6d0e129949e1dfec98) with short edge 256 pixels are available) and [cvdfoundation/kinetics-dataset](https://github.com/cvdfoundation/kinetics-dataset) (hosted by the Common Visual Data Foundation; Kinetics400/Kinetics600/Kinetics-700-2020 are available)
+
+## Step 3. Extract RGB and Flow
+
+This part is **optional** if you only want to use the video loader.
+
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+
+If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames.
+
+```shell
+# execute these lines (Assume the SSD is mounted at "/mnt/SSD/")
+mkdir /mnt/SSD/${DATASET}_extracted_train/
+ln -s /mnt/SSD/${DATASET}_extracted_train/ ../../../data/${DATASET}/rawframes_train/
+mkdir /mnt/SSD/${DATASET}_extracted_val/
+ln -s /mnt/SSD/${DATASET}_extracted_val/ ../../../data/${DATASET}/rawframes_val/
+```
+
+If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow.
+
+```shell
+bash extract_rgb_frames.sh ${DATASET}
+```
+
+If you haven't installed denseflow, you can still extract RGB frames using OpenCV with the following script, but it will keep the original size of the images.
+
+```shell
+bash extract_rgb_frames_opencv.sh ${DATASET}
+```
+
+If both are required, run the following script to extract frames.
+
+```shell
+bash extract_frames.sh ${DATASET}
+```
+
+The commands above can generate images with a new short edge of 256. If you want to generate images with a short edge of 320 (320p), or with a fixed size of 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`.
+More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md).
+
+## Step 4. Generate File List
+
+You can run the following scripts to generate file lists in the format of videos and rawframes, respectively.
+ +```shell +bash generate_videos_filelist.sh ${DATASET} +# execute the command below when rawframes are ready +bash generate_rawframes_filelist.sh ${DATASET} +``` + +```` +````` + +### Folder Structure + +After the whole data pipeline for Kinetics preparation. +you can get the rawframes (RGB + Flow), videos and annotation files for Kinetics. + +In the context of the whole project (for Kinetics only), the *minimal* folder structure will look like: +(*minimal* means that some data are not necessary: for example, you may want to evaluate kinetics using the original video format.) + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ${DATASET} +│ │ ├── ${DATASET}_train_list_videos.txt +│ │ ├── ${DATASET}_val_list_videos.txt +│ │ ├── annotations +│ │ ├── videos_train +│ │ ├── videos_val +│ │ │ ├── abseiling +│ │ │ │ ├── 0wR5jVB-WPk_000417_000427.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── wrapping_present +│ │ │ ├── ... +│ │ │ ├── zumba +│ │ ├── rawframes_train (optional) +│ │ ├── rawframes_val (optional) + +``` + +For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..bf81e28c012425864a88daf31ccf31c7bd354fbf --- /dev/null +++ b/tools/data/kinetics/README_zh-CN.md @@ -0,0 +1,174 @@ +# 准备 Kinetics-\[400/600/700\] + +## 简介 + + + +```BibTeX +@inproceedings{inproceedings, + author = {Carreira, J. and Zisserman, Andrew}, + year = {2017}, + month = {07}, + pages = {4724-4733}, + title = {Quo Vadis, Action Recognition? 
A New Model and the Kinetics Dataset},
+ doi = {10.1109/CVPR.2017.502}
+}
+```
+
+请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。
+
+:::{note}
+由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小:
+
+| 数据集 | 训练视频 | 验证集视频 |
+| :---------: | :------: | :--------: |
+| Kinetics400 | 240436 | 19796 |
+| Kinetics600 | 383393 | 27910 |
+| Kinetics700 | 542357 | 34824 |
+
+:::
+
+`````{tabs}
+
+````{group-tab} 使用 MIM 下载
+:::{note}
+MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。
+:::
+
+MIM 支持下载 Kinetics-400/600/700 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。
+
+```Bash
+# 安装 OpenXLab CLI 工具
+pip install -U openxlab
+# 登录 OpenXLab
+openxlab login
+# 通过 MIM 进行 Kinetics-400 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics400
+# 通过 MIM 进行 Kinetics-600 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics600
+# 通过 MIM 进行 Kinetics-700 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics700
+```
+
+````
+
+````{group-tab} 从官方源下载
+## 1. 准备标注文件
+
+此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。
+在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。
+首先,用户可以使用如下脚本从 [Kinetics 数据集官网](https://deepmind.com/research/open-source/open-source-datasets/kinetics/)下载标注文件并进行预处理:
+
+```shell
+bash download_annotations.sh ${DATASET}
+```
+
+由于部分视频的 URL 不可用,当前官方标注中所含视频数量可能小于初始版本。所以 MMAction2 提供了另一种方式以获取初始版本标注作为参考。
+在这其中,Kinetics400 和 Kinetics600 的标注文件来自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/199c9358907928a47cdfc81de4db788fddc2f91d/Crawler/Kinetics/data),
+Kinetics700 的标注文件于 05/02/2021 下载自 [网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/)。
+
+```shell
+bash download_backup_annotations.sh ${DATASET}
+```
+
+## 2. 
准备视频 + +用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。注意这一步骤将花费较长时间。 + +```shell +bash download_videos.sh ${DATASET} +``` + +**重要提示**:如果在此之前已下载好 Kinetics 数据集的视频,还需使用重命名脚本来替换掉类名中的空格: + +```shell +bash rename_classnames.sh ${DATASET} +``` + +为提升解码速度,用户可以使用以下脚本将原始视频缩放至更小的分辨率(利用稠密编码): + +```bash +python ../resize_videos.py ../../../data/${DATASET}/videos_train/ ../../../data/${DATASET}/videos_train_256p_dense_cache --dense --level 2 +``` + +也可以从 [Academic Torrents](https://academictorrents.com/) 中下载短边长度为 256 的 [kinetics400](https://academictorrents.com/details/184d11318372f70018cf9a72ef867e2fb9ce1d26) 和 [kinetics700](https://academictorrents.com/details/49f203189fb69ae96fb40a6d0e129949e1dfec98),或从 Common Visual Data Foundation 维护的 [cvdfoundation/kinetics-dataset](https://github.com/cvdfoundation/kinetics-dataset) 中下载 Kinetics400/Kinetics600/Kinetics-700-2020。 + +## 3. 提取 RGB 帧和光流 + +如果用户仅使用 video loader,则可以跳过本步。 + +在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有足够的 SSD 空间,那么建议将视频抽取为 RGB 帧以提升 I/O 性能。用户可以使用以下脚本为抽取得到的帧文件夹建立软连接: + +```shell +# 执行以下脚本 (假设 SSD 被挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/${DATASET}_extracted_train/ +ln -s /mnt/SSD/${DATASET}_extracted_train/ ../../../data/${DATASET}/rawframes_train/ +mkdir /mnt/SSD/${DATASET}_extracted_val/ +ln -s /mnt/SSD/${DATASET}_extracted_val/ ../../../data/${DATASET}/rawframes_val/ +``` + +如果用户只使用 RGB 帧(由于光流提取非常耗时),可以考虑执行以下脚本,仅用 denseflow 提取 RGB 帧: + +```shell +bash extract_rgb_frames.sh ${DATASET} +``` + +如果用户未安装 denseflow,以下脚本可以使用 OpenCV 进行 RGB 帧的提取,但视频原分辨率大小会被保留: + +```shell +bash extract_rgb_frames_opencv.sh ${DATASET} +``` + +如果同时需要 RGB 帧和光流,可使用如下脚本抽帧: + +```shell +bash extract_frames.sh ${DATASET} +``` + +以上的命令生成短边长度为 256 的 RGB 帧和光流帧。如果用户需要生成短边长度为 320 的帧 (320p),或是固定分辨率为 340 x 256 的帧,可改变参数 `--new-short 256` 为 `--new-short 320` 或 `--new-width 340 --new-height 256`。 +更多细节可以参考 
[数据准备](/docs/zh_cn/user_guides/prepare_dataset.md)。 + +## 4. 生成文件列表 + +用户可以使用以下两个脚本分别为视频和帧文件夹生成文件列表: + +```shell +bash generate_videos_filelist.sh ${DATASET} +# 为帧文件夹生成文件列表 +bash generate_rawframes_filelist.sh ${DATASET} +``` + +```` +````` + +### 目录结构 + +在完整完成 Kinetics 的数据处理后,将得到帧文件夹(RGB 帧和光流帧),视频以及标注文件。 + +在整个项目目录下(仅针对 Kinetics),*最简* 目录结构如下所示: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ${DATASET} +│ │ ├── ${DATASET}_train_list_videos.txt +│ │ ├── ${DATASET}_val_list_videos.txt +│ │ ├── annotations(可选) +│ │ ├── videos_train +│ │ ├── videos_val +│ │ │ ├── abseiling +│ │ │ │ ├── 0wR5jVB-WPk_000417_000427.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── wrapping_present +│ │ │ ├── ... +│ │ │ ├── zumba +│ │ ├── rawframes_train(可选) +│ │ ├── rawframes_val(可选) + +``` + +关于 Kinetics 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/kinetics/download.py b/tools/data/kinetics/download.py new file mode 100644 index 0000000000000000000000000000000000000000..7e152eb7cb6fda48a8e6d8deeb762fadb4c95e95 --- /dev/null +++ b/tools/data/kinetics/download.py @@ -0,0 +1,230 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/activitynet/ActivityNet/ +# Original licence: Copyright (c) Microsoft, under the MIT License. 
+# ------------------------------------------------------------------------------ +import argparse +import glob +import json +import os +import shutil +import ssl +import subprocess +import uuid +from collections import OrderedDict + +import pandas as pd +from joblib import Parallel, delayed + +ssl._create_default_https_context = ssl._create_unverified_context + + +def create_video_folders(dataset, output_dir, tmp_dir): + """Creates a directory for each label name in the dataset.""" + if 'label-name' not in dataset.columns: + this_dir = os.path.join(output_dir, 'test') + if not os.path.exists(this_dir): + os.makedirs(this_dir) + # I should return a dict but ... + return this_dir + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + label_to_dir = {} + for label_name in dataset['label-name'].unique(): + this_dir = os.path.join(output_dir, label_name) + if not os.path.exists(this_dir): + os.makedirs(this_dir) + label_to_dir[label_name] = this_dir + return label_to_dir + + +def construct_video_filename(row, label_to_dir, trim_format='%06d'): + """Given a dataset row, this function constructs the output filename for a + given video.""" + basename = '%s_%s_%s.mp4' % (row['video-id'], + trim_format % row['start-time'], + trim_format % row['end-time']) + if not isinstance(label_to_dir, dict): + dirname = label_to_dir + else: + dirname = label_to_dir[row['label-name']] + output_filename = os.path.join(dirname, basename) + return output_filename + + +def download_clip(video_identifier, + output_filename, + start_time, + end_time, + tmp_dir='/tmp/kinetics/.tmp_dir', + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from youtube if exists and is not blocked. + arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. 
+ start_time: float + Indicates the beginning time in seconds from where the video + will be trimmed. + end_time: float + Indicates the ending time in seconds of the trimmed video. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + # Construct command line for getting the direct video link. + tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4()) + + if not os.path.exists(output_filename): + if not os.path.exists(tmp_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', + '--no-check-certificate', '-f', 'mp4', '-o', + '"%s"' % tmp_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + attempts += 1 + if attempts == num_attempts: + return status, err.output + else: + break + + tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] + # Construct command to trim the videos (ffmpeg required). + command = [ + 'ffmpeg', '-i', + '"%s"' % tmp_filename, '-ss', + str(start_time), '-t', + str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '1', '-loglevel', 'panic', + '"%s"' % output_filename + ] + command = ' '.join(command) + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + return status, err.output + + # Check if the video was successfully saved. 
+ status = os.path.exists(output_filename) + os.remove(tmp_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): + """Wrapper for parallel processing purposes.""" + output_filename = construct_video_filename(row, label_to_dir, trim_format) + clip_id = os.path.basename(output_filename).split('.mp4')[0] + if os.path.exists(output_filename): + status = tuple([clip_id, True, 'Exists']) + return status + + downloaded, log = download_clip( + row['video-id'], + output_filename, + row['start-time'], + row['end-time'], + tmp_dir=tmp_dir) + status = tuple([clip_id, downloaded, log]) + return status + + +def parse_kinetics_annotations(input_csv, ignore_is_cc=False): + """Returns a parsed DataFrame. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'YouTube Identifier,Start time,End time,Class label' + returns: + ------- + dataset: DataFrame + Pandas with the following columns: + 'video-id', 'start-time', 'end-time', 'label-name' + """ + df = pd.read_csv(input_csv) + if 'youtube_id' in df.columns: + columns = OrderedDict([('youtube_id', 'video-id'), + ('time_start', 'start-time'), + ('time_end', 'end-time'), + ('label', 'label-name')]) + df.rename(columns=columns, inplace=True) + if ignore_is_cc: + df = df.loc[:, df.columns.tolist()[:-1]] + return df + + +def main(input_csv, + output_dir, + trim_format='%06d', + num_jobs=24, + tmp_dir='/tmp/kinetics'): + tmp_dir = os.path.join(tmp_dir, '.tmp_dir') + + # Reading and parsing Kinetics. + dataset = parse_kinetics_annotations(input_csv) + + # Creates folders where videos will be saved later. + label_to_dir = create_video_folders(dataset, output_dir, tmp_dir) + + # Download all clips. 
+ if num_jobs == 1: + status_list = [] + for _, row in dataset.iterrows(): + status_list.append( + download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)) + else: + status_list = Parallel( + n_jobs=num_jobs)(delayed(download_clip_wrapper)( + row, label_to_dir, trim_format, tmp_dir) + for i, row in dataset.iterrows()) + + # Clean tmp dir. + shutil.rmtree(tmp_dir) + + # Save download report. + with open('download_report.json', 'w') as fobj: + fobj.write(json.dumps(status_list)) + + +if __name__ == '__main__': + description = 'Helper script for downloading and trimming kinetics videos.' + p = argparse.ArgumentParser(description=description) + p.add_argument( + 'input_csv', + type=str, + help=('CSV file containing the following format: ' + 'YouTube Identifier,Start time,End time,Class label')) + p.add_argument( + 'output_dir', + type=str, + help='Output directory where videos will be saved.') + p.add_argument( + '-f', + '--trim-format', + type=str, + default='%06d', + help=('This will be the format for the ' + 'filename of trimmed videos: ' + 'videoid_%0xd(start_time)_%0xd(end_time).mp4')) + p.add_argument('-n', '--num-jobs', type=int, default=24) + p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics') + # help='CSV file of the previous version of Kinetics.') + main(**vars(p.parse_args())) diff --git a/tools/data/kinetics/download_annotations.sh b/tools/data/kinetics/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..1afc7b110b6c5aeb924e326248ad359e5a14a3cc --- /dev/null +++ b/tools/data/kinetics/download_annotations.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}/annotations" + +if [[ ! 
-d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://storage.googleapis.com/deepmind-media/Datasets/${DATASET}.tar.gz + +tar -zxvf ${DATASET}.tar.gz --strip-components 1 -C ${DATA_DIR}/ +mv ${DATA_DIR}/train.csv ${DATA_DIR}/kinetics_train.csv +mv ${DATA_DIR}/validate.csv ${DATA_DIR}/kinetics_val.csv +mv ${DATA_DIR}/test.csv ${DATA_DIR}/kinetics_test.csv + +rm ${DATASET}.tar.gz +rm ${DATA_DIR}/*.json diff --git a/tools/data/kinetics/download_backup_annotations.sh b/tools/data/kinetics/download_backup_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..67745b5a0844d825d6338a108683a2154eafd70f --- /dev/null +++ b/tools/data/kinetics/download_backup_annotations.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + + +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_train.csv +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_val.csv +wget https://download.openmmlab.com/mmaction/dataset/${DATASET}/annotations/kinetics_test.csv + +mv kinetics_train.csv ${DATA_DIR}/kinetics_train.csv +mv kinetics_val.csv ${DATA_DIR}/kinetics_val.csv +mv kinetics_test.csv ${DATA_DIR}/kinetics_test.csv diff --git a/tools/data/kinetics/download_videos.sh b/tools/data/kinetics/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..deb8094c20a4c6ce286950d662e455376f1e8349 --- /dev/null +++ b/tools/data/kinetics/download_videos.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate kinetics +pip install --upgrade youtube-dl + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +DATA_DIR="../../../data/${DATASET}" +ANNO_DIR="../../../data/${DATASET}/annotations" +python download.py ${ANNO_DIR}/kinetics_train.csv ${DATA_DIR}/videos_train +python download.py ${ANNO_DIR}/kinetics_val.csv ${DATA_DIR}/videos_val + +source deactivate kinetics +conda remove -n kinetics --all diff --git a/tools/data/kinetics/environment.yml b/tools/data/kinetics/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d9959e88a91881de1be2d38928c63e9aa79938 --- /dev/null +++ b/tools/data/kinetics/environment.yml @@ -0,0 +1,36 @@ +name: kinetics +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - 
python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl diff --git a/tools/data/kinetics/extract_frames.sh b/tools/data/kinetics/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..9fd1c51b3e56926b7db061f7277b379c0a5e0224 --- /dev/null +++ b/tools/data/kinetics/extract_frames.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ ../../data/${DATASET}/rawframes_train/ --level 2 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --flow-type tvl1 --ext mp4 --task both --new-short 256 +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/extract_rgb_frames.sh b/tools/data/kinetics/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..64997930303e74b8b636de59f8f174e822f29a19 --- /dev/null +++ b/tools/data/kinetics/extract_rgb_frames.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ 
../../data/${DATASET}/rawframes_train/ --level 2 --ext mp4 --task rgb --new-short 256 +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --ext mp4 --task rgb --new-short 256 +echo "Raw frames (RGB only) generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/extract_rgb_frames_opencv.sh b/tools/data/kinetics/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..aa066e4307eb95e61eab33e1bfab8b8eb2f85b1a --- /dev/null +++ b/tools/data/kinetics/extract_rgb_frames_opencv.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../ +python build_rawframes.py ../../data/${DATASET}/videos_train/ ../../data/${DATASET}/rawframes_train/ --level 2 --ext mp4 --task rgb --new-short 256 --use-opencv +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/${DATASET}/videos_val/ ../../data/${DATASET}/rawframes_val/ --level 2 --ext mp4 --task rgb --new-short 256 --use-opencv +echo "Raw frames (RGB only) generated for val set" + +cd ${DATASET}/ diff --git a/tools/data/kinetics/generate_rawframes_filelist.sh b/tools/data/kinetics/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..dc41bfc69fbb86c2439eaff2d6c717a691679c43 --- /dev/null +++ b/tools/data/kinetics/generate_rawframes_filelist.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../ 
+PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/rawframes_train/ --level 2 --format rawframes --num-split 1 --subset train --shuffle +echo "Train filelist for rawframes generated." + +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/rawframes_val/ --level 2 --format rawframes --num-split 1 --subset val --shuffle +echo "Val filelist for rawframes generated." +cd tools/data/${DATASET}/ diff --git a/tools/data/kinetics/generate_videos_filelist.sh b/tools/data/kinetics/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..252e828428173bb4c07e5ea2d5436899e3712bc5 --- /dev/null +++ b/tools/data/kinetics/generate_videos_filelist.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/videos_train/ --level 2 --format videos --num-split 1 --subset train --shuffle +echo "Train filelist for video generated." + +PYTHONPATH=. python tools/data/build_file_list.py ${DATASET} data/${DATASET}/videos_val/ --level 2 --format videos --num-split 1 --subset val --shuffle +echo "Val filelist for video generated." 
+cd tools/data/kinetics/ diff --git a/tools/data/kinetics/label_map_k400.txt b/tools/data/kinetics/label_map_k400.txt new file mode 100644 index 0000000000000000000000000000000000000000..9193a07c6bda30b85b591da52e5e4cb375c31c06 --- /dev/null +++ b/tools/data/kinetics/label_map_k400.txt @@ -0,0 +1,400 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +balloon blowing +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +drinking +drinking beer +drinking shots +driving car +driving tractor +drop kicking 
+drumming fingers +dunking basketball +dying hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +garbage collecting +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking fruit +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing 
flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing (not slalom or crosscountry) +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +sniffing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer 
+tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying tie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba diff --git a/tools/data/kinetics/label_map_k600.txt b/tools/data/kinetics/label_map_k600.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6bbf70797ecbfed445cc6b02039f3475ac3a502 --- /dev/null +++ b/tools/data/kinetics/label_map_k600.txt @@ -0,0 +1,600 @@ +abseiling +acting in play +adjusting glasses +air drumming +alligator wrestling +answering questions +applauding +applying cream +archaeological excavation +archery +arguing +arm wrestling +arranging flowers +assembling bicycle +assembling computer +attending conference +auctioning +backflip (human) +baking cookies +bandaging +barbequing +bartending +base jumping +bathing dog +battle rope training +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowdrying hair +blowing bubble gum +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bodysurfing +bookbinding +bottling +bouncing on bouncy castle +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +breaking boards +breathing fire +brush painting +brushing hair +brushing teeth +building cabinet +building lego +building sandcastle +building shed +bull fighting +bulldozing +bungee jumping +burping +busking +calculating +calligraphy +canoeing or 
kayaking +capoeira +capsizing +card stacking +card throwing +carrying baby +cartwheeling +carving ice +carving pumpkin +casting fishing line +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing gear in car +changing oil +changing wheel (not on bike) +checking tires +cheerleading +chewing gum +chiseling stone +chiseling wood +chopping meat +chopping vegetables +chopping wood +clam digging +clapping +clay pottery making +clean and jerk +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +coloring in +combing hair +contact juggling +contorting +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +cooking scallops +cosplaying +counting money +country line dancing +cracking back +cracking knuckles +cracking neck +crawling baby +crossing eyes +crossing river +crying +cumbia +curling (sport) +curling hair +cutting apple +cutting nails +cutting orange +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +delivering mail +dining +directing traffic +disc golfing +diving cliff +docking boat +dodgeball +doing aerobics +doing jigsaw puzzle +doing laundry +doing nails +drawing +dribbling basketball +drinking shots +driving car +driving tractor +drooling +drop kicking +drumming fingers +dumpster diving +dunking basketball +dyeing eyebrows +dyeing hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +embroidering +exercising with an exercise ball +extinguishing fire +faceplanting +falling off bike +falling off chair +feeding birds +feeding fish +feeding goats +fencing (sport) +fidgeting +finger snapping +fixing bicycle +fixing hair +flint knapping +flipping pancake +fly tying +flying kite 
+folding clothes +folding napkins +folding paper +front raises +frying vegetables +geocaching +getting a haircut +getting a piercing +getting a tattoo +giving or receiving award +gold panning +golf chipping +golf driving +golf putting +gospel singing in church +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +hand washing clothes +head stand +headbanging +headbutting +high jump +high kick +historical reenactment +hitting baseball +hockey stop +holding snake +home roasting coffee +hopscotch +hoverboarding +huddling +hugging (not baby) +hugging baby +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ice swimming +inflating balloons +installing carpet +ironing +ironing hair +javelin throw +jaywalking +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping bicycle +jumping into pool +jumping jacks +jumpstyle dancing +karaoke +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +land sailing +laughing +lawn mower racing +laying bricks +laying concrete +laying stone +laying tiles +leatherworking +licking +lifting hat +lighting fire +lock picking +long jump +longboarding +looking at phone +luge +lunge +making a cake +making a sandwich +making balloon shapes +making bubbles +making cheese +making horseshoes +making jewelry +making paper aeroplanes +making pizza +making snowman +making sushi +making tea +making the bed +marching +marriage proposal +massaging back +massaging feet +massaging legs +massaging neck +massaging person's head +milking cow +moon walking +mopping floor +mosh pit dancing +motorcycling +mountain climber (exercise) +moving furniture +mowing lawn +mushroom foraging +needle felting +news anchoring +opening bottle (not wine) +opening door +opening present +opening refrigerator +opening wine bottle +packing +paragliding +parasailing +parkour +passing American football (in game) +passing american football (not in game) +passing soccer ball 
+peeling apples +peeling potatoes +person collecting garbage +petting animal (not cat) +petting cat +photobombing +photocopying +picking fruit +pillow fight +pinching +pirouetting +planing wood +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing beer pong +playing blackjack +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing darts +playing didgeridoo +playing dominoes +playing drums +playing field hockey +playing flute +playing gong +playing guitar +playing hand clapping games +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing laser tag +playing lute +playing maracas +playing marbles +playing monopoly +playing netball +playing ocarina +playing organ +playing paintball +playing pan pipes +playing piano +playing pinball +playing ping pong +playing poker +playing polo +playing recorder +playing rubiks cube +playing saxophone +playing scrabble +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing with trains +playing xylophone +poking bellybutton +pole vault +polishing metal +popping balloons +pouring beer +preparing salad +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelbarrow +pushing wheelchair +putting in contact lenses +putting on eyeliner +putting on foundation +putting on lipstick +putting on mascara +putting on sari +putting on shoes +raising eyebrows +reading book +reading newspaper +recording music +repairing puncture +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mule +riding or walking with horse +riding scooter +riding snow blower +riding unicycle +ripping paper +roasting marshmallows +roasting pig +robot dancing +rock climbing +rock scissors paper 
+roller skating +rolling pastry +rope pushdown +running on treadmill +sailing +salsa dancing +sanding floor +sausage making +sawing wood +scrambling eggs +scrapbooking +scrubbing face +scuba diving +separating eggs +setting table +sewing +shaking hands +shaking head +shaping bread dough +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining flashlight +shining shoes +shooting basketball +shooting goal (soccer) +shopping +shot put +shoveling snow +shucking oysters +shuffling cards +shuffling feet +side kick +sign language interpreting +singing +sipping cup +situp +skateboarding +ski jumping +skiing crosscountry +skiing mono +skiing slalom +skipping rope +skipping stone +skydiving +slacklining +slapping +sled dog racing +sleeping +smashing +smelling feet +smoking +smoking hookah +smoking pipe +snatch weight lifting +sneezing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spelunking +spinning poi +spray painting +springboard diving +square dancing +squat +standing on hands +staring +steer roping +sticking tongue out +stomping grapes +stretching arm +stretching leg +sucking lolly +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swimming front crawl +swing dancing +swinging baseball bat +swinging on something +sword fighting +sword swallowing +tackling +tagging graffiti +tai chi +talking on cell phone +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +tasting wine +testifying +texting +threading needle +throwing axe +throwing ball (not baseball or American football) +throwing discus +throwing knife +throwing snowballs +throwing tantrum +throwing water balloon +tickling +tie dying +tightrope walking +tiptoeing +tobogganing +tossing coin +training dog +trapezing +trimming or shaving beard +trimming shrubs +trimming trees +triple jump +twiddling fingers +tying bow tie +tying knot (not on a tie) +tying necktie 
+tying shoe laces +unboxing +unloading truck +using a microscope +using a paint roller +using a power drill +using a sledge hammer +using a wrench +using atm +using bagging machine +using circular saw +using inhaler +using puppets +using remote controller (not gaming) +using segway +vacuuming floor +visiting the zoo +wading through mud +wading through water +waiting in line +waking up +walking the dog +walking through snow +washing dishes +washing feet +washing hair +washing hands +watching tv +water skiing +water sliding +watering plants +waving hand +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +weaving fabric +welding +whistling +windsurfing +winking +wood burning (art) +wrapping present +wrestling +writing +yarn spinning +yawning +yoga +zumba diff --git a/tools/data/kinetics/label_map_k700.txt b/tools/data/kinetics/label_map_k700.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbcedf99bf9e4bbb71bb061401e422c98a300956 --- /dev/null +++ b/tools/data/kinetics/label_map_k700.txt @@ -0,0 +1,700 @@ +abseiling +acting in play +adjusting glasses +air drumming +alligator wrestling +answering questions +applauding +applying cream +archaeological excavation +archery +arguing +arm wrestling +arranging flowers +arresting +assembling bicycle +assembling computer +attending conference +auctioning +baby waking up +backflip (human) +baking cookies +bandaging +barbequing +bartending +base jumping +bathing dog +battle rope training +beatboxing +bee keeping +being excited +being in zero gravity +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blending fruit +blowdrying hair +blowing bubble gum +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bodysurfing +bookbinding +bottling +bouncing ball (not juggling) +bouncing on bouncy castle +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +breaking boards +breaking glass 
+breathing fire +brush painting +brushing floor +brushing hair +brushing teeth +building cabinet +building lego +building sandcastle +building shed +bulldozing +bungee jumping +burping +busking +calculating +calligraphy +canoeing or kayaking +capoeira +capsizing +card stacking +card throwing +carrying baby +carrying weight +cartwheeling +carving ice +carving marble +carving pumpkin +carving wood with a knife +casting fishing line +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing gear in car +changing oil +changing wheel (not on bike) +chasing +checking tires +checking watch +cheerleading +chewing gum +chiseling stone +chiseling wood +chopping meat +chopping wood +clam digging +clapping +clay pottery making +clean and jerk +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +closing door +coloring in +combing hair +contact juggling +contorting +cooking chicken +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +cooking scallops +cosplaying +coughing +counting money +country line dancing +cracking back +cracking knuckles +cracking neck +crawling baby +crocheting +crossing eyes +crossing river +crying +cumbia +curling (sport) +curling eyelashes +curling hair +cutting apple +cutting cake +cutting nails +cutting orange +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +dealing cards +decorating the christmas tree +decoupage +delivering mail +digging +dining +directing traffic +disc golfing +diving cliff +docking boat +dodgeball +doing aerobics +doing jigsaw puzzle +doing laundry +doing nails +doing sudoku +drawing +dribbling basketball +drinking shots +driving car +driving tractor +drooling +drop kicking +drumming fingers +dumpster diving +dunking basketball +dyeing eyebrows +dyeing hair +eating burger +eating cake 
+eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating nachos +eating spaghetti +eating watermelon +egg hunting +embroidering +entering church +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +falling off bike +falling off chair +feeding birds +feeding fish +feeding goats +fencing (sport) +fidgeting +filling cake +filling eyebrows +finger snapping +fixing bicycle +fixing hair +flint knapping +flipping bottle +flipping pancake +fly tying +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +gargling +geocaching +getting a haircut +getting a piercing +getting a tattoo +giving or receiving award +gold panning +golf chipping +golf driving +golf putting +gospel singing in church +grinding meat +grooming cat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +hand washing clothes +head stand +headbanging +headbutting +helmet diving +herding cattle +high fiving +high jump +high kick +historical reenactment +hitting baseball +hockey stop +holding snake +home roasting coffee +hopscotch +hoverboarding +huddling +hugging (not baby) +hugging baby +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ice swimming +inflating balloons +installing carpet +ironing +ironing hair +javelin throw +jaywalking +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping bicycle +jumping into pool +jumping jacks +jumping sofa +jumpstyle dancing +karaoke +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +land sailing +laughing +lawn mower racing +laying bricks +laying concrete +laying decking +laying stone +laying tiles +leatherworking +letting go of balloon +licking +lifting hat +lighting candle +lighting fire +listening with headphones +lock picking +long jump +longboarding +looking at phone +looking in mirror +luge +lunge +making a cake +making a sandwich +making balloon shapes 
+making bubbles +making cheese +making horseshoes +making jewelry +making latte art +making paper aeroplanes +making pizza +making slime +making snowman +making sushi +making tea +making the bed +marching +marriage proposal +massaging back +massaging feet +massaging legs +massaging neck +massaging person's head +metal detecting +milking cow +milking goat +mixing colours +moon walking +mopping floor +mosh pit dancing +motorcycling +mountain climber (exercise) +moving baby +moving child +moving furniture +mowing lawn +mushroom foraging +needle felting +news anchoring +opening bottle (not wine) +opening coconuts +opening door +opening present +opening refrigerator +opening wine bottle +packing +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +passing soccer ball +peeling apples +peeling banana +peeling potatoes +person collecting garbage +petting animal (not cat) +petting cat +petting horse +photobombing +photocopying +picking apples +picking blueberries +pillow fight +pinching +pirouetting +planing wood +planting trees +plastering +playing accordion +playing american football +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing beer pong +playing billiards +playing blackjack +playing cards +playing cello +playing checkers +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing darts +playing didgeridoo +playing dominoes +playing drums +playing field hockey +playing flute +playing gong +playing guitar +playing hand clapping games +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing laser tag +playing lute +playing mahjong +playing maracas +playing marbles +playing monopoly +playing netball +playing nose flute +playing oboe +playing ocarina +playing organ +playing paintball +playing pan pipes +playing piano +playing piccolo +playing pinball +playing ping pong +playing poker +playing polo 
+playing recorder +playing road hockey +playing rounders +playing rubiks cube +playing saxophone +playing scrabble +playing shuffleboard +playing slot machine +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing with trains +playing xylophone +poaching eggs +poking bellybutton +pole vault +polishing furniture +polishing metal +popping balloons +pouring beer +pouring milk +pouring wine +preparing salad +presenting weather forecast +pretending to be a statue +pull ups +pulling espresso shot +pulling rope (game) +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelbarrow +pushing wheelchair +putting in contact lenses +putting on eyeliner +putting on foundation +putting on lipstick +putting on mascara +putting on sari +putting on shoes +putting wallpaper on wall +raising eyebrows +reading book +reading newspaper +recording music +repairing puncture +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mule +riding or walking with horse +riding scooter +riding snow blower +riding unicycle +ripping paper +roasting marshmallows +roasting pig +robot dancing +rock climbing +rock scissors paper +roller skating +rolling eyes +rolling pastry +rope pushdown +running on treadmill +sailing +salsa dancing +saluting +sanding floor +sanding wood +sausage making +sawing wood +scrambling eggs +scrapbooking +scrubbing face +scuba diving +seasoning food +separating eggs +setting table +sewing +shaking hands +shaking head +shaping bread dough +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining flashlight +shining shoes +shoot dance +shooting basketball +shooting goal (soccer) +shooting off fireworks +shopping +shot put +shouting +shoveling snow +shredding paper +shucking oysters +shuffling cards +shuffling feet +side kick +sieving +sign language interpreting +silent disco 
+singing +sipping cup +situp +skateboarding +ski ballet +ski jumping +skiing crosscountry +skiing mono +skiing slalom +skipping rope +skipping stone +skydiving +slacklining +slapping +sled dog racing +sleeping +slicing onion +smashing +smelling feet +smoking +smoking hookah +smoking pipe +snatch weight lifting +sneezing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spelunking +spinning plates +spinning poi +splashing water +spray painting +spraying +springboard diving +square dancing +squat +squeezing orange +stacking cups +stacking dice +standing on hands +staring +steer roping +steering car +sticking tongue out +stomping grapes +stretching arm +stretching leg +sucking lolly +surfing crowd +surfing water +surveying +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swimming front crawl +swimming with dolphins +swimming with sharks +swing dancing +swinging baseball bat +swinging on something +sword fighting +sword swallowing +tackling +tagging graffiti +tai chi +taking photo +talking on cell phone +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +tasting wine +testifying +texting +threading needle +throwing axe +throwing ball (not baseball or American football) +throwing discus +throwing knife +throwing snowballs +throwing tantrum +throwing water balloon +tickling +tie dying +tightrope walking +tiptoeing +tobogganing +tossing coin +tossing salad +training dog +trapezing +treating wood +trimming or shaving beard +trimming shrubs +trimming trees +triple jump +twiddling fingers +tying bow tie +tying knot (not on a tie) +tying necktie +tying shoe laces +unboxing +uncorking champagne +unloading truck +using a microscope +using a paint roller +using a power drill +using a sledge hammer +using a wrench +using atm +using bagging machine +using circular saw +using inhaler +using megaphone +using puppets +using remote controller (not gaming) +using segway +vacuuming car +vacuuming 
floor +visiting the zoo +wading through mud +wading through water +waiting in line +waking up +walking on stilts +walking the dog +walking through snow +walking with crutches +washing dishes +washing feet +washing hair +washing hands +watching tv +water skiing +water sliding +watering plants +waving hand +waxing armpits +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +weaving fabric +welding +whistling +windsurfing +winking +wood burning (art) +wrapping present +wrestling +writing +yarn spinning +yawning +yoga +zumba diff --git a/tools/data/kinetics/preprocess_k400.sh b/tools/data/kinetics/preprocess_k400.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a886511365d25186e13e1a9d5b3adbc0ca2e5d7 --- /dev/null +++ b/tools/data/kinetics/preprocess_k400.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics-400 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k600.sh b/tools/data/kinetics/preprocess_k600.sh new file mode 100644 index 0000000000000000000000000000000000000000..2af2318f269cd23520126076340a515ad23f6a86 --- /dev/null +++ b/tools/data/kinetics/preprocess_k600.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics600 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k700.sh b/tools/data/kinetics/preprocess_k700.sh new file mode 100644 index 0000000000000000000000000000000000000000..601ce75837b13c802b019d50e7cb2f21249ca7c0 --- /dev/null +++ b/tools/data/kinetics/preprocess_k700.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +mv $(dirname $DATA_ROOT)/Kinetics_700 
$DATA_ROOT diff --git a/tools/data/kinetics/rename_classnames.sh b/tools/data/kinetics/rename_classnames.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a338ac0fe7f9b246d20111cf06f4df339e01551 --- /dev/null +++ b/tools/data/kinetics/rename_classnames.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Rename classname for convenience +DATASET=$1 +if [ "$DATASET" == "kinetics400" ] || [ "$1" == "kinetics600" ] || [ "$1" == "kinetics700" ]; then + echo "We are processing $DATASET" +else + echo "Bad Argument, we only support kinetics400, kinetics600 or kinetics700" + exit 0 +fi + +cd ../../../data/${DATASET}/ +ls ./videos_train | while read class; do \ + newclass=`echo $class | tr " " "_" `; + if [ "${class}" != "${newclass}" ] + then + mv "videos_train/${class}" "videos_train/${newclass}"; + fi +done + +ls ./videos_val | while read class; do \ + newclass=`echo $class | tr " " "_" `; + if [ "${class}" != "${newclass}" ] + then + mv "videos_val/${class}" "videos_val/${newclass}"; + fi +done + +cd ../../tools/data/kinetics/ diff --git a/tools/data/kinetics710/README.md b/tools/data/kinetics710/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7494f41dfd05b74a29cbe5929e344c259eecddd5 --- /dev/null +++ b/tools/data/kinetics710/README.md @@ -0,0 +1,91 @@ +# Preparing Kinetics-710 + +## Introduction + + + +```BibTeX +@misc{li2022uniformerv2, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao}, + year={2022}, + eprint={2211.09552}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, please refer to the [paper](https://arxiv.org/pdf/2211.09552.pdf). The scripts can be used for preparing kinetics-710. 
MMAction2 supports Kinetics-710 +dataset as a concat dataset, which means it only provides a list of annotation files, and makes use of the original data of Kinetics-400/600/700 dataset. You could refer to the [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py) +for details, which also provides a template config about how to use concat dataset in MMAction2. +Before we start, please make sure that the directory is located at `$MMACTION2`. + +## Step 1. Download Kinetics 400/600/700 + +Kinetics-710 is a video benchmark based on Kinetics-400/600/700, which merges the training set of these Kinetics datasets, and deletes the repeated videos according to Youtube IDs. MMAction2 provides an annotation file based on the Kinetics-400/600/700 on [OpenDataLab](https://opendatalab.com/). So we suggest you download Kinetics-400/600/700 first from OpenDataLab by [MIM](https://github.com/open-mmlab/mim). + +```shell +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login +# download Kinetics-400/600/700, note that this might take a long time. +mim download mmaction2 --dataset kinetics400 +mim download mmaction2 --dataset kinetics600 +mim download mmaction2 --dataset kinetics700 + +``` + +## Step 2. Download Kinetics-710 Annotations + +We provide the annotation list of Kinetics-710 corresponding to OpenDataLab version Kinetics, you could download it from aliyun and unzip it to the `$MMACTION2/data/` + +```shell +wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip +cd data && unzip annotations.zip && cd .. + +``` + +## Step 3. Folder Structure + +After the whole data pipeline for Kinetics preparation, +you can get the videos and annotation files for Kinetics-710. 
+ +In the context of the whole project (for Kinetics only), the *minimal* folder structure will look like: +(*minimal* means that some data are not necessary: for example, you may want to evaluate kinetics using the original video format.) + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── kinetics400 +│ │ ├── videos_train +│ │ ├── videos_val +│ │ │ ├── jf7RDuUTrsQ.mp4 +│ │ │ ├── ... +│ ├── kinetics600 +│ │ ├── videos +│ │ │ ├── vol_00 +│ │ │ │ ├── -A5JFdMXB_k_000018_000028.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── vol63 +│ ├── kinetics700 +│ │ ├── videos +│ │ │ ├── vol_00 +│ │ │ │ ├── -Paa0R0tQ1w_000009_000019.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── vol63 +│ ├── kinetics710 +│ │ ├── k400_train_list_videos.txt +│ │ ├── k400_val_list_videos.txt +│ │ ├── k600_train_list_videos.txt +│ │ ├── k600_val_list_videos.txt +│ │ ├── k700_train_list_videos.txt +│ │ ├── k700_val_list_videos.txt +``` + +For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
diff --git a/tools/data/kinetics710/README_zh-CN.md b/tools/data/kinetics710/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..17b817b0beea8c0eeeb79b4aea46c584fe96fb97 --- /dev/null +++ b/tools/data/kinetics710/README_zh-CN.md @@ -0,0 +1,89 @@ +# 准备 Kinetics-710 + +## 介绍 + + + +```BibTeX +@misc{li2022uniformerv2, + title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer}, + author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao}, + year={2022}, + eprint={2211.09552}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +关于基本数据集信息,请参考 [论文](https://arxiv.org/pdf/2211.09552.pdf)。这些脚本可以用于准备 kinetics-710。MMAction2 以 Concat Dataset 的形式支持了 Kinetics-710 数据集,我们只提供一个注释文件列表,并利用 Kinetics-400/600/700 数据集的原始数据。你可以参考 [配置](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py) 了解详情,它也提供了一个模板配置,说明了如何在 MMAction2 中使用 Concat Dataset。 +在我们开始之前,请确保目录位于 `$MMACTION2`。 + +## 第一步:下载 Kinetics 400/600/700 + +Kinetics-710 是基于 Kinetics-400/600/700 的视频数据集,它合并了这些 Kinetics 数据集的训练集,并根据 Youtube ID 删除了重复的视频。MMAction2 提供了一个基于 Kinetics-400/600/700 的 OpenDataLab 版本的标注文件,你可以通过 [MIM](https://github.com/open-mmlab/mim) 从 OpenDataLab 下载。 + +```shell +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login +# 下载 Kinetics-400/600/700,注意这可能需要很长时间。 +mim download mmaction2 --dataset kinetics400 +mim download mmaction2 --dataset kinetics600 +mim download mmaction2 --dataset kinetics700 + +``` + +## 第二步:下载 Kinetics-710 标注文件 + +我们提供了与 OpenDataLab 版本 Kinetics 相对应的 Kinetics-710 标注文件列表,你可以从阿里云下载它,并将其解压到 `$MMACTION2/data/` + +```shell +wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip +cd data && unzip annotations.zip && cd .. 
+ +``` + +## 第三步:文件夹结构 + +完成 Kinetics 准备的整个数据流程后。 +你可以得到 Kinetics-710 的视频和注释文件。 + +在整个项目目录下(仅针对 Kinetics),*最小*的文件夹结构如下: +(*最小*意味着一些数据是不必要的:例如,你可能想要使用原始视频格式评估 kinetics。) + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── kinetics400 +│ │ ├── videos_train +│ │ ├── videos_val +│ │ │ ├── jf7RDuUTrsQ.mp4 +│ │ │ ├── ... +│ ├── kinetics600 +│ │ ├── videos +│ │ │ ├── vol_00 +│ │ │ │ ├── -A5JFdMXB_k_000018_000028.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── vol63 +│ ├── kinetics700 +│ │ ├── videos +│ │ │ ├── vol_00 +│ │ │ │ ├── -Paa0R0tQ1w_000009_000019.mp4 +│ │ │ │ ├── ... +│ │ │ ├── ... +│ │ │ ├── vol63 +│ ├── kinetics710 +│ │ ├── k400_train_list_videos.txt +│ │ ├── k400_val_list_videos.txt +│ │ ├── k600_train_list_videos.txt +│ │ ├── k600_val_list_videos.txt +│ │ ├── k700_train_list_videos.txt +│ │ ├── k700_val_list_videos.txt +``` + +关于在 Kinetics 上进行训练和评估,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/kinetics710/label_map_k710.txt b/tools/data/kinetics710/label_map_k710.txt new file mode 100644 index 0000000000000000000000000000000000000000..12834df684e8e8095835d351ab66f3964a5e4e6f --- /dev/null +++ b/tools/data/kinetics710/label_map_k710.txt @@ -0,0 +1,710 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +inflating balloons +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball 
+catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel (not on bike) +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages (not on barbeque) +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +sipping cup +drinking beer +drinking shots +driving car +driving tractor +drop kicking +drumming fingers +dunking basketball +dyeing hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +person collecting garbage +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging (not baby) +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer 
ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making the bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle (not wine) +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking apples +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head 
+sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing mono +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +smelling feet +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying necktie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba +poaching eggs +playing nose flute +entering church +closing door +helmet diving +doing sudoku +coughing +seasoning food +peeling banana +eating nachos +waxing armpits +shouting +silent disco +polishing furniture +taking photo +dealing cards +putting wallpaper on wall +uncorking champagne +curling eyelashes +brushing floor +pulling espresso 
shot +playing american football +grooming cat +playing checkers +moving child +stacking cups +squeezing orange +opening coconuts +rolling eyes +picking blueberries +playing road hockey +carving wood with a knife +slicing onion +saluting +letting go of balloon +breaking glass +carrying weight +mixing colours +moving baby +blending fruit +pouring milk +surveying +making slime +sieving +walking with crutches +flipping bottle +playing billiards +arresting +listening with headphones +spinning plates +carving marble +cutting cake +shoot dance +being excited +petting horse +splashing water +filling cake +stacking dice +checking watch +treating wood +laying decking +shooting off fireworks +pouring wine +pretending to be a statue +steering car +playing rounders +looking in mirror +jumping sofa +lighting candle +walking on stilts +crocheting +playing piccolo +vacuuming car +high fiving +playing shuffleboard +chasing +pulling rope (game) +being in zero gravity +sanding wood +decoupage +using megaphone +making latte art +ski ballet +playing oboe +bouncing ball (not juggling) +playing mahjong +herding cattle +swimming with sharks +milking goat +swimming with dolphins +metal detecting +playing slot machine +polishing metal +throwing tantrum +lawn mower racing +laying stone +cutting orange +skipping stone +pouring beer +making bubbles +jaywalking +leatherworking +card stacking +putting on eyeliner +card throwing +chewing gum +falling off bike +repairing puncture +dumpster diving +tiptoeing +sleeping +using circular saw +cracking knuckles +pinching +chiseling wood +playing rubiks cube +weaving fabric +fencing (sport) +sword swallowing +lighting fire +vacuuming floor +combing hair +building lego +playing pinball +fly tying +playing lute +opening door +waving hand +rolling pastry +chiseling stone +threading needle +playing dominoes +opening wine bottle +playing with trains +steer roping +playing field hockey +separating eggs +sewing +talking on cell phone +needle felting +pushing 
wheelbarrow +using a paint roller +playing netball +lifting hat +massaging neck +blowing bubble gum +walking through snow +docking boat +clam digging +marriage proposal +packing +sausage making +licking +scrapbooking +flint knapping +lock picking +putting on lipstick +sawing wood +playing hand clapping games +geocaching +looking at phone +making cheese +poking bellybutton +contorting +fixing bicycle +using a microscope +using a wrench +doing jigsaw puzzle +making horseshoes +cooking scallops +square dancing +getting a piercing +playing ocarina +making paper aeroplanes +playing scrabble +visiting the zoo +crossing eyes +jumping bicycle +throwing water balloon +bodysurfing +pirouetting +luge +spelunking +watching tv +attending conference +curling (sport) +directing traffic +swimming front crawl +ice swimming +battle rope training +putting on mascara +bouncing on bouncy castle +smoking pipe +pillow fight +putting on sari +calligraphy +roasting pig +cracking back +shopping +burping +using bagging machine +staring +shucking oysters +blowdrying hair +smashing +playing laser tag +wading through mud +rope pushdown +preparing salad +making balloon shapes +tagging graffiti +adjusting glasses +using a power drill +trimming shrubs +popping balloons +playing pan pipes +using puppets +arguing +backflip (human) +riding snow blower +hand washing clothes +calculating +gospel singing in church +standing on hands +tasting wine +shaping bread dough +wading through water +falling off chair +throwing snowballs +building sandcastle +land sailing +tying shoe laces +jumping jacks +wood burning (art) +putting on foundation +putting on shoes +cumbia +archaeological excavation +mountain climber (exercise) +assembling bicycle +head stand +cutting apple +shuffling feet +bottling +breathing fire +using inhaler +historical reenactment +hugging baby +mushroom foraging +delivering mail +laying tiles +using atm +chopping meat +tightrope walking +mosh pit dancing +photobombing +coloring in +huddling 
+playing gong +laying concrete +breaking boards +acting in play +base jumping +tie dying +using a sledge hammer +playing ping pong +photocopying +winking +waking up +swinging baseball bat +twiddling fingers +playing polo +longboarding +ironing hair +bathing dog +moon walking +playing marbles +embroidering +playing beer pong +home roasting coffee +gold panning +karaoke +changing gear in car +raising eyebrows +yarn spinning +scrubbing face +fidgeting +planing wood +cosplaying +capsizing +tackling +shining flashlight +dyeing eyebrows +drooling +alligator wrestling +playing blackjack +carving ice +playing maracas +opening refrigerator +throwing knife +putting in contact lenses +passing soccer ball +casting fishing line +sucking lolly +installing carpet +bulldozing +roasting marshmallows +playing darts +chopping vegetables +bull fighting diff --git a/tools/data/mit/README.md b/tools/data/mit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..61dd24ba5d24dfb63f61028c3274d40f7e5a65ef --- /dev/null +++ b/tools/data/mit/README.md @@ -0,0 +1,128 @@ +# Preparing Moments in Time + +## Introduction + + + +```BibTeX +@article{monfortmoments, + title={Moments in Time Dataset: one million videos for event understanding}, + author={Monfort, Mathew and Andonian, Alex and Zhou, Bolei and Ramakrishnan, Kandan and Bargal, Sarah Adel and Yan, Tom and Brown, Lisa and Fan, Quanfu and Gutfruend, Dan and Vondrick, Carl and others}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2019}, + issn={0162-8828}, + pages={1--8}, + numpages={8}, + doi={10.1109/TPAMI.2019.2901464}, +} +``` + +For basic dataset information, you can refer to the dataset [website](http://moments.csail.mit.edu/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/mit/`. + +## Step 1. 
Prepare Annotations and Videos + +First of all, you have to visit the official [website](http://moments.csail.mit.edu/), fill in an application form for downloading the dataset. Then you will get the download link. You can use `bash preprocess_data.sh` to prepare annotations and videos. However, the download command is missing in that script. Remember to download the dataset to the proper place following the comments in this script. + +For better decoding speed, you can resize the original videos into smaller sized, densely encoded version by: + +```shell +python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames. + +```shell +# execute these two lines (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/mit_extracted/ +ln -s /mnt/SSD/mit_extracted/ ../../../data/mit/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +bash extract_frames.sh +``` + +## Step 4. Generate File List + +You can run the following script to generate file list in the format of rawframes and videos. 
+ +```shell +bash generate_{rawframes, videos}_filelist.sh +``` + +## Step 5. Check Directory Structure + +After the whole data process for Moments in Time preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Moments in Time. + +In the context of the whole project (for Moments in Time only), the folder structure will look like: + +``` +mmaction2 +├── data +│   └── mit +│   ├── annotations +│   │   ├── license.txt +│   │   ├── moments_categories.txt +│   │   ├── README.txt +│   │   ├── trainingSet.csv +│   │   └── validationSet.csv +│   ├── mit_train_rawframe_anno.txt +│   ├── mit_train_video_anno.txt +│   ├── mit_val_rawframe_anno.txt +│   ├── mit_val_video_anno.txt +│   ├── rawframes +│   │   ├── training +│   │   │   ├── adult+female+singing +│   │   │   │   ├── 0P3XG_vf91c_35 +│   │   │   │   │   ├── flow_x_00001.jpg +│   │   │   │   │   ├── flow_x_00002.jpg +│   │   │   │   │   ├── ... +│   │   │   │   │   ├── flow_y_00001.jpg +│   │   │   │   │   ├── flow_y_00002.jpg +│   │   │   │   │   ├── ... +│   │   │   │   │   ├── img_00001.jpg +│   │   │   │   │   └── img_00002.jpg +│   │   │   │   └── yt-zxQfALnTdfc_56 +│   │   │   │   │   ├── ... +│   │   │   └── yawning +│   │   │   ├── _8zmP1e-EjU_2 +│   │   │      │   ├── ... +│   │   └── validation +│   │   │   ├── ... +│   └── videos +│   ├── training +│   │   ├── adult+female+singing +│   │   │   ├── 0P3XG_vf91c_35.mp4 +│   │   │   ├── ... +│   │   │   └── yt-zxQfALnTdfc_56.mp4 +│   │   └── yawning +│   │   ├── ... +│   └── validation +│   │   ├── ... +└── mmaction +└── ... + +``` + +For training and evaluating on Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
diff --git a/tools/data/mit/README_zh-CN.md b/tools/data/mit/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..4761c49559b3bd4316135930d9324d0d8bcbc126 --- /dev/null +++ b/tools/data/mit/README_zh-CN.md @@ -0,0 +1,130 @@ +# 准备 Moments in Time + +## 简介 + + + +```BibTeX +@article{monfortmoments, + title={Moments in Time Dataset: one million videos for event understanding}, + author={Monfort, Mathew and Andonian, Alex and Zhou, Bolei and Ramakrishnan, Kandan and Bargal, Sarah Adel and Yan, Tom and Brown, Lisa and Fan, Quanfu and Gutfruend, Dan and Vondrick, Carl and others}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2019}, + issn={0162-8828}, + pages={1--8}, + numpages={8}, + doi={10.1109/TPAMI.2019.2901464}, +} +``` + +用户可以参照数据集 [官网](http://moments.csail.mit.edu/),获取数据集相关的基本信息。 +在准备数据集前,请确保命令行当前路径为 `$MMACTION2/tools/data/mit/`。 + +## 步骤 1. 准备标注文件和视频文件 + +首先,用户需要访问[官网](http://moments.csail.mit.edu/),填写申请表来下载数据集。 +在得到下载链接后,用户可以使用 `bash preprocess_data.sh` 来准备标注文件和视频。 +请注意此脚本并没有下载标注和视频文件,用户需要根据脚本文件中的注释,提前下载好数据集,并放/软链接到合适的位置。 + +为加快视频解码速度,用户需要缩小原视频的尺寸,可使用以下命令获取密集编码版视频: + +```shell +python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. 
抽取帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。 +用户可使用以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行指令进行抽取(假设 SSD 挂载在 "/mnt/SSD/"上) +mkdir /mnt/SSD/mit_extracted/ +ln -s /mnt/SSD/mit_extracted/ ../../../data/mit/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本进行抽取。 + +```shell +bash extract_frames.sh +``` + +## 步骤 3. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +bash generate_{rawframes, videos}_filelist.sh +``` + +## 步骤 4. 检查目录结构 + +在完成 Moments in Time 数据集准备流程后,用户可以得到 Moments in Time 的 RGB 帧 + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Moments in Time 的文件结构如下: + +``` +mmaction2 +├── data +│   └── mit +│   ├── annotations +│   │   ├── license.txt +│   │   ├── moments_categories.txt +│   │   ├── README.txt +│   │   ├── trainingSet.csv +│   │   └── validationSet.csv +│   ├── mit_train_rawframe_anno.txt +│   ├── mit_train_video_anno.txt +│   ├── mit_val_rawframe_anno.txt +│   ├── mit_val_video_anno.txt +│   ├── rawframes +│   │   ├── training +│   │   │   ├── adult+female+singing +│   │   │   │   ├── 0P3XG_vf91c_35 +│   │   │   │   │   ├── flow_x_00001.jpg +│   │   │   │   │   ├── flow_x_00002.jpg +│   │   │   │   │   ├── ... +│   │   │   │   │   ├── flow_y_00001.jpg +│   │   │   │   │   ├── flow_y_00002.jpg +│   │   │   │   │   ├── ... +│   │   │   │   │   ├── img_00001.jpg +│   │   │   │   │   └── img_00002.jpg +│   │   │   │   └── yt-zxQfALnTdfc_56 +│   │   │   │   │   ├── ... +│   │   │   └── yawning +│   │   │   ├── _8zmP1e-EjU_2 +│   │   │      │   ├── ... +│   │   └── validation +│   │   │   ├── ... 
+│   └── videos +│   ├── training +│   │   ├── adult+female+singing +│   │   │   ├── 0P3XG_vf91c_35.mp4 +│   │   │   ├── ... +│   │   │   └── yt-zxQfALnTdfc_56.mp4 +│   │   └── yawning +│   │   ├── ... +│   └── validation +│   │   ├── ... +└── mmaction +└── ... + +``` + +关于对 Moments in Time 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/mit/extract_frames.sh b/tools/data/mit/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d5f76347feea8215193f7f197b719161e6133b1 --- /dev/null +++ b/tools/data/mit/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for train set" + +python build_rawframes.py ../../data/mit/videos/validation/ ../../data/mit/rawframes/validation/ --level 2 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for val set" + +cd mit/ diff --git a/tools/data/mit/extract_rgb_frames.sh b/tools/data/mit/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..a043d7d081f7774a8b2e97fee1846fc1743e6f02 --- /dev/null +++ b/tools/data/mit/extract_rgb_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/mit/videos/validation ../../data/mit/rawframes/validation/ --level 2 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for val set" + +cd mit/ diff --git a/tools/data/mit/extract_rgb_frames_opencv.sh b/tools/data/mit/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..487952945191b9ddbef59570f83659e1ec180951 --- /dev/null +++ 
b/tools/data/mit/extract_rgb_frames_opencv.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mit/videos/training ../../data/mit/rawframes/training/ --level 2 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for train set" + +python build_rawframes.py ../../data/mit/videos/validation ../../data/mit/rawframes/validation/ --level 2 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for val set" + +cd mit/ diff --git a/tools/data/mit/generate_rawframes_filelist.sh b/tools/data/mit/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f24a5338fcea348b15ad7fa85900105dffb5262 --- /dev/null +++ b/tools/data/mit/generate_rawframes_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/rawframes/training/ --level 2 --format rawframes --num-split 1 --subset train --shuffle +echo "Train filelist for rawframes generated." + +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/rawframes/validation/ --level 2 --format rawframes --num-split 1 --subset val --shuffle +echo "Val filelist for rawframes generated." +cd tools/data/mit/ diff --git a/tools/data/mit/generate_videos_filelist.sh b/tools/data/mit/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..705aa144e52a84064e359a5e48b4764a397aef84 --- /dev/null +++ b/tools/data/mit/generate_videos_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/videos/training/ --level 2 --format videos --num-split 1 --subset train --shuffle +echo "Train filelist for videos generated." + +PYTHONPATH=. python tools/data/build_file_list.py mit data/mit/videos/validation/ --level 2 --format videos --num-split 1 --subset val --shuffle +echo "Val filelist for videos generated." 
+cd tools/data/mit/ diff --git a/tools/data/mit/label_map.txt b/tools/data/mit/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..898789370cdc65f0ee92926ce9df213b5ff60bee --- /dev/null +++ b/tools/data/mit/label_map.txt @@ -0,0 +1,339 @@ +clapping +praying +dropping +burying +covering +flooding +leaping +drinking +slapping +cuddling +sleeping +preaching +raining +stitching +spraying +twisting +coaching +submerging +breaking +tuning +boarding +running +destroying +competing +giggling +shoveling +chasing +flicking +pouring +buttoning +hammering +carrying +surfing +pulling +squatting +aiming +crouching +tapping +skipping +washing +winking +queuing +locking +stopping +sneezing +flipping +sewing +clipping +working +rocking +asking +playing+fun +camping +plugging +pedaling +constructing +slipping +sweeping +screwing +shrugging +hitchhiking +cracking +scratching +trimming +selling +marching +stirring +kissing +jumping +starting +clinging +socializing +picking +splashing +licking +kicking +sliding +filming +driving +handwriting +steering +filling +crashing +stealing +pressing +shouting +hiking +vacuuming +pointing +giving +diving +hugging +building +swerving +dining +floating +cheerleading +leaning +sailing +singing +playing +hitting +bubbling +joining +bathing +raising +sitting +drawing +protesting +rinsing +coughing +smashing +slicing +balancing +rafting +kneeling +dunking +brushing +crushing +rubbing +punting +watering +playing+music +removing +tearing +imitating +teaching +cooking +reaching +studying +serving +bulldozing +shaking +discussing +dragging +gardening +performing +officiating +photographing +sowing +dripping +writing +clawing +bending +boxing +mopping +gripping +flowing +digging +tripping +cheering +buying +bicycling +feeding +emptying +unpacking +sketching +standing +weeding +stacking +drying +crying +spinning +frying +cutting +paying +eating +lecturing +dancing +adult+female+speaking +boiling +peeling +wrapping +wetting 
+attacking +welding +putting +swinging +carving +walking +dressing +inflating +climbing +shredding +reading +sanding +frowning +closing +hunting +clearing +launching +packaging +fishing +spilling +leaking +knitting +boating +sprinkling +baptizing +playing+sports +rolling +spitting +dipping +riding +chopping +extinguishing +applauding +calling +talking +adult+male+speaking +snowing +shaving +marrying +rising +laughing +crawling +flying +assembling +injecting +landing +operating +packing +descending +falling +entering +pushing +sawing +smelling +overflowing +fighting +waking +barbecuing +skating +painting +drilling +punching +tying +manicuring +plunging +grilling +pitching +towing +telephoning +crafting +knocking +playing+videogames +storming +placing +turning +barking +child+singing +opening +waxing +juggling +mowing +shooting +sniffing +interviewing +stomping +chewing +arresting +grooming +rowing +bowing +gambling +saluting +fueling +autographing +throwing +drenching +waving +signing +repairing +baking +smoking +skiing +drumming +child+speaking +blowing +cleaning +combing +spreading +racing +combusting +adult+female+singing +fencing +swimming +adult+male+singing +snuggling +shopping +bouncing +dusting +stroking +snapping +biting +roaring +guarding +unloading +lifting +instructing +folding +measuring +whistling +exiting +stretching +taping +squinting +catching +draining +massaging +scrubbing +handcuffing +celebrating +jogging +colliding +bowling +resting +blocking +smiling +tattooing +erupting +howling +parading +grinning +sprinting +hanging +planting +speaking +ascending +yawning +cramming +burning +wrestling +poking +tickling +exercising +loading +piloting +typing diff --git a/tools/data/mit/preprocess_data.sh b/tools/data/mit/preprocess_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..5e5bd8ec1e697d38e1127f7a4c9c3ea0d18eb8b2 --- /dev/null +++ b/tools/data/mit/preprocess_data.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + 
+DATA_DIR="../../../data/mit/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +# Download the Moments_in_Time_Raw.zip here manually +unzip Moments_in_Time_Raw.zip +rm Moments_in_Time_Raw.zip + +if [ ! -d "./videos" ]; then + mkdir ./videos +fi +mv ./training ./videos && mv ./validation ./videos + +if [ ! -d "./annotations" ]; then + mkdir ./annotations +fi + +mv *.txt annotations && mv *.csv annotations + +cd "../../tools/data/mit" diff --git a/tools/data/mmit/README.md b/tools/data/mmit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6b48614ea47fb8a4ed524ad92bfe3465019a60dc --- /dev/null +++ b/tools/data/mmit/README.md @@ -0,0 +1,113 @@ +# Preparing Multi-Moments in Time + +## Introduction + + + +```BibTeX +@misc{monfort2019multimoments, + title={Multi-Moments in Time: Learning and Interpreting Models for Multi-Action Video Understanding}, + author={Mathew Monfort and Kandan Ramakrishnan and Alex Andonian and Barry A McNamara and Alex Lascelles, Bowen Pan, Quanfu Fan, Dan Gutfreund, Rogerio Feris, Aude Oliva}, + year={2019}, + eprint={1911.00232}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, you can refer to the dataset [website](http://moments.csail.mit.edu). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/mmit/`. + +## Step 1. Prepare Annotations and Videos + +First of all, you have to visit the official [website](http://moments.csail.mit.edu/), fill in an application form for downloading the dataset. Then you will get the download link. You can use `bash preprocess_data.sh` to prepare annotations and videos. However, the download command is missing in that script. Remember to download the dataset to the proper place following the comments in this script. 
+ +For better decoding speed, you can resize the original videos into smaller sized, densely encoded version by: + +``` +python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +First, you can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/mmit_extracted/ +ln -s /mnt/SSD/mmit_extracted/ ../../../data/mmit/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 3. Generate File List + +you can run the follow script to generate file list in the format of rawframes or videos. + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## Step 4. Check Directory Structure + +After the whole data process for Multi-Moments in Time preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Multi-Moments in Time. 
+ +In the context of the whole project (for Multi-Moments in Time only), the folder structure will look like: + +``` +mmaction2/ +└── data + └── mmit + ├── annotations + │   ├── moments_categories.txt + │   ├── trainingSet.txt + │   └── validationSet.txt + ├── mmit_train_rawframes.txt + ├── mmit_train_videos.txt + ├── mmit_val_rawframes.txt + ├── mmit_val_videos.txt + ├── rawframes + │   ├── 0-3-6-2-9-1-2-6-14603629126_5 + │   │   ├── flow_x_00001.jpg + │   │   ├── flow_x_00002.jpg + │   │   ├── ... + │   │   ├── flow_y_00001.jpg + │   │   ├── flow_y_00002.jpg + │   │   ├── ... + │   │   ├── img_00001.jpg + │   │   └── img_00002.jpg + │   │   ├── ... + │   └── yt-zxQfALnTdfc_56 + │   │   ├── ... + │   └── ... + + └── videos + └── adult+female+singing + ├── 0-3-6-2-9-1-2-6-14603629126_5.mp4 + └── yt-zxQfALnTdfc_56.mp4 + └── ... +``` + +For training and evaluating on Multi-Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/mmit/README_zh-CN.md b/tools/data/mmit/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..5b90efdb0b7f20087c2378c9cd5e3a15beb30d7d --- /dev/null +++ b/tools/data/mmit/README_zh-CN.md @@ -0,0 +1,115 @@ +# 准备 Multi-Moments in Time + +## 简介 + + + +```BibTeX +@misc{monfort2019multimoments, + title={Multi-Moments in Time: Learning and Interpreting Models for Multi-Action Video Understanding}, + author={Mathew Monfort and Kandan Ramakrishnan and Alex Andonian and Barry A McNamara and Alex Lascelles, Bowen Pan, Quanfu Fan, Dan Gutfreund, Rogerio Feris, Aude Oliva}, + year={2019}, + eprint={1911.00232}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +用户可以参照数据集 [官网](http://moments.csail.mit.edu/),获取数据集相关的基本信息。 +在准备数据集前,请确保命令行当前路径为 `$MMACTION2/tools/data/mmit/`。 + +## 步骤 1. 
Prepare Annotations and Videos + +首先,用户需要访问[官网](http://moments.csail.mit.edu/),填写申请表来下载数据集。 +在得到下载链接后,用户可以使用 `bash preprocess_data.sh` 来准备标注文件和视频。 +请注意此脚本并没有下载标注和视频文件,用户需要根据脚本文件中的注释,提前下载好数据集,并放/软链接到合适的位置。 + +为加快视频解码速度,用户需要缩小原视频的尺寸,可使用以下命令获取密集编码版视频: + +``` +python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_256p_dense_cache --dense --level 2 +``` + +## Step 2. 抽取帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。 +用户可使用以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行指令进行抽取(假设 SSD 挂载在 "/mnt/SSD/"上) +mkdir /mnt/SSD/mmit_extracted/ +ln -s /mnt/SSD/mmit_extracted/ ../../../data/mmit/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本进行抽取。 + +```shell +bash extract_frames.sh +``` + +## 步骤 3. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +bash generate_rawframes_filelist.sh +bash generate_videos_filelist.sh +``` + +## 步骤 4. 检查目录结构 + +在完成 Multi-Moments in Time 数据集准备流程后,用户可以得到 Multi-Moments in Time 的 RGB 帧 + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Multi-Moments in Time 的文件结构如下: + +``` +mmaction2/ +└── data + └── mmit + ├── annotations + │   ├── moments_categories.txt + │   ├── trainingSet.txt + │   └── validationSet.txt + ├── mmit_train_rawframes.txt + ├── mmit_train_videos.txt + ├── mmit_val_rawframes.txt + ├── mmit_val_videos.txt + ├── rawframes + │   ├── 0-3-6-2-9-1-2-6-14603629126_5 + │   │   ├── flow_x_00001.jpg + │   │   ├── flow_x_00002.jpg + │   │   ├── ... + │   │   ├── flow_y_00001.jpg + │   │   ├── flow_y_00002.jpg + │   │   ├── ... + │   │   ├── img_00001.jpg + │   │   └── img_00002.jpg + │   │   ├── ... + │   └── yt-zxQfALnTdfc_56 + │   │   ├── ... 
+ │   └── ... + + └── videos + └── adult+female+singing + ├── 0-3-6-2-9-1-2-6-14603629126_5.mp4 + └── yt-zxQfALnTdfc_56.mp4 + └── ... +``` + +关于对 Multi-Moments in Time 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/mmit/extract_frames.sh b/tools/data/mmit/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..259c46baecb78a406062fd18781fde0059ecf5ed --- /dev/null +++ b/tools/data/mmit/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mmit/videos/ ../../../data/mmit/rawframes/ --task both --level 2 --flow-type tvl1 --ext mp4 +echo "Raw frames (RGB and Flow) Generated" +cd mmit/ diff --git a/tools/data/mmit/extract_rgb_frames.sh b/tools/data/mmit/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..571adb8817004e38e23cac5478d9f7a0b68641e0 --- /dev/null +++ b/tools/data/mmit/extract_rgb_frames.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mmit/videos/ ../../data/mmit/rawframes/ --task rgb --level 2 --ext mp4 + +echo "Genearte raw frames (RGB only)" + +cd mmit/ diff --git a/tools/data/mmit/extract_rgb_frames_opencv.sh b/tools/data/mmit/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..835292718c65e40f55297452551eb5605cbbbab5 --- /dev/null +++ b/tools/data/mmit/extract_rgb_frames_opencv.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/mmit/videos/ ../../data/mmit/rawframes/ --task rgb --level 2 --ext mp4 --use-opencv + +echo "Genearte raw frames (RGB only)" + +cd mmit/ diff --git a/tools/data/mmit/generate_rawframes_filelist.sh b/tools/data/mmit/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..aaed71bb082376c26311c23120827c7fba9b811d --- /dev/null +++ b/tools/data/mmit/generate_rawframes_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash 
+ +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/rawframes/ --level 2 --format rawframes --num-split 1 --subset train --shuffle +echo "Train filelist for rawframes generated." + +PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/rawframes/ --level 2 --format rawframes --num-split 1 --subset val --shuffle +echo "Val filelist for rawframes generated." +cd tools/data/mmit/ diff --git a/tools/data/mmit/generate_videos_filelist.sh b/tools/data/mmit/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..49460f4c5b436c1fb774f6917b747fcb045479d1 --- /dev/null +++ b/tools/data/mmit/generate_videos_filelist.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/videos/ --level 2 --format videos --num-split 1 --subset train --shuffle +echo "Train filelist for videos generated." + +PYTHONPATH=. python tools/data/build_file_list.py mmit data/mmit/videos/ --level 2 --format videos --num-split 1 --subset val --shuffle +echo "Val filelist for videos generated." 
+cd tools/data/mmit/ diff --git a/tools/data/mmit/label_map.txt b/tools/data/mmit/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..756feba2e004da2f343f20d1ba85fb2062dd9000 --- /dev/null +++ b/tools/data/mmit/label_map.txt @@ -0,0 +1,313 @@ +crafting +paddling +raining +weightlifting +clawing +hitchhiking +autographing +cooking +gripping +swerving +frowning +giving +tattooing +dipping +leaking +plunging +barking +stroking/petting +piloting +camping +towing +loading +parading +submerging +squeezing +sculpting +stomping +punting +kissing +smoking +pouring +texting +adult+male+speaking +adult+female+speaking +crying +unpacking +pointing +boating +landing +ironing +crouching +slapping +typing +ice+skating +boiling +chopping +bowling +fighting/attacking +tapping +applauding +driving +sprinting +slicing +approaching +waving +dusting +wrapping +knocking +snapping +gardening +combing +tickling +carving +smashing +smiling/grinning +dressing +pressing +lecturing +telephoning +exercising +riding +draining +flying +wrestling +boxing +rinsing +overflowing +inflating +picking +sowing +shaving +baking +shaking +running +throwing +stacking/piling +buttoning +leaping +fueling +pitching +child+speaking +breaking/destroying +lifting +filming/photographing +singing +reading +chewing +operating +bubbling +waxing +cleaning/washing +scooping +erasing +steering +playing+videogames +crashing +constructing/assembling +flooding +drinking +praying +shouting +winking +dining +repairing +tying +juggling +rolling +studying +marching +socializing +ascending/rising +arresting +cracking +laying +clinging +frying +vacuuming +combusting/burning +filling +standing +howling +dunking +spraying +bandaging +shivering +slipping +racing +roaring +planting +yawning +grilling +squinting +skiing +taping +trimming +preaching +resting +descending/lowering +clearing +screwing +chasing +speaking +manicuring +tripping +performing +teaching/instructing +blowing +painting +sneezing 
+packaging +punching +clapping +rotating/spinning +skating +cheerleading +balancing +child+singing +covering +snuggling/cuddling/hugging +bulldozing +jumping +sliding +barbecuing +weeding +swimming +shooting +dialing +measuring +pulling +celebrating +playing+fun +knitting +spreading +erupting +snowboarding +swinging +protesting +sitting +inserting +bouncing +surfing +extinguishing +unloading +aiming +bathing +hammering +fishing +opening +biting +packing +saluting +rafting +laughing +bicycling +rocking +storming +wetting +shrugging +handwriting +gambling +writing +skipping +dragging +unplugging +kicking +sawing +grooming +whistling +floating +diving +rubbing +bending +shoveling/digging +peeling +catching +closing +eating/feeding +falling +discussing +sweeping +massaging +locking +dancing +mowing +clipping +hanging +burying +reaching +kayaking +snowing +sleeping +climbing +flipping +tearing/ripping +folding +signing +cutting +stretching +stirring +licking +kneeling +sewing +dripping +queuing +pushing +pedaling +flossing +buying/selling/shopping +smelling/sniffing +emptying +sanding +smacking +carrying +adult+male+singing +poking +brushing +adult+female+singing +scratching +welding +crawling +skateboarding +turning +dropping +hunting +cheering +drawing +sprinkling +spitting +competing +bowing +hiking +drying +launching +twisting +crushing +hitting/colliding +shredding +plugging +gasping +rowing +calling +drumming +walking +removing +waking +stitching +coughing +playing+music +playing+sports +interviewing +scrubbing +splashing +officiating +mopping +flowing +sailing +drilling +squatting +handcuffing +spilling +marrying +injecting +jogging diff --git a/tools/data/mmit/preprocess_data.sh b/tools/data/mmit/preprocess_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..4d54052c04babaa2e0fda1ccc6b2ae2c27ff0584 --- /dev/null +++ b/tools/data/mmit/preprocess_data.sh @@ -0,0 +1,20 @@ +DATA_DIR="../../../data/mmit/" + +if [[ ! 
-d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +# Download the Multi_Moments_in_Time_Raw.zip here manually +unzip Multi_Moments_in_Time_Raw.zip +rm Multi_Moments_in_Time_Raw.zip + +if [ ! -d "./annotations" ]; then + mkdir ./annotations +fi + +mv *.txt annotations && mv *.csv annotations + +cd - diff --git a/tools/data/msrvtt/README.md b/tools/data/msrvtt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1699cb3f3ff19c3e5881c56a1ecdaba103fbb515 --- /dev/null +++ b/tools/data/msrvtt/README.md @@ -0,0 +1,68 @@ +# Preparing MSR-VTT Retrieval/ Video Question-Answering Dataset + +## Introduction + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +Before preparing the dataset, please make sure that the directory is located at `$MMACTION2/tools/data/msrvtt/`. + +## Step 1.
Download Annotation Files + +You can directly download the following annotation files related to MSR-VTT from the [Google Drive link](https://drive.google.com/drive/folders/12cr94wT8j7pR09AR2nmQg6o26Y1arI50) provided by [VindLU](https://github.com/klauscc) and place them in the `$MMACTION2/tools/data/msrvtt/annotations` directory: + +- [msrvtt_qa_train.json](https://drive.google.com/file/d/12dJq5_7v8FytrJwrPB_f22tET1MmGCNh/view?usp=drive_link) +- [msrvtt_qa_val.json](https://drive.google.com/file/d/138q-A-V8fCC2nBYJgqkQa3gBfXVNbNNd/view?usp=drive_link) +- [msrvtt_qa_test.json](https://drive.google.com/file/d/13IiEcUMHiNppWhGwVY1eAaip6iSJM35A/view?usp=drive_link) +- [msrvtt_qa_answer_list.json](https://drive.google.com/file/d/131euz_dssRkDTk3-ioAS5ZsvIxS_Tt4M/view?usp=drive_link) +- [msrvtt_mc_test.json](https://drive.google.com/file/d/13FrUQ2ZDsNDraP7lfnKvTArPIgdtHuLC/view?usp=drive_link) +- [msrvtt_ret_train9k.json](https://drive.google.com/file/d/13OVo0XRdVWTHlFFxbKg3daYCHsMbJxyd/view?usp=drive_link) +- [msrvtt_ret_train7k.json](https://drive.google.com/file/d/13ID97BX4ExO6mWPIUMp-GzXcPBkviSLx/view?usp=drive_link) +- [msrvtt_ret_test1k.json](https://drive.google.com/file/d/13FLrjI-aleKeU7LbJMDrYgktX7MbTbzu/view?usp=drive_link) +- [msrvtt_test1k.json](https://drive.google.com/file/d/12z6y-DNwIfICSzOhekbJwSbf7z2hlibE/view?usp=drive_link) + +## Step 2. Prepare Video Data + +You can refer to the [official website](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) of this dataset for basic information. 
Run the following commands to prepare the MSRVTT video files: + +```shell +# Download original videos +bash download_msrvtt.sh +# Preprocess videos to lower FPS and dimensions +bash compress_msrvtt.sh +``` + +After completing the above preparation steps, the directory structure will be as follows: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ └── msrvtt +│ │ ├── annotations +│ │ │ ├── msrvtt_qa_train.json +│ │ │ ├── msrvtt_qa_val.json +│ │ │ ├── msrvtt_qa_test.json +│ │ │ ├── msrvtt_qa_answer_list.json +│ │ │ ├── msrvtt_mc_test.json +│ │ │ ├── msrvtt_ret_train9k.json +│ │ │ ├── msrvtt_ret_train7k.json +│ │ │ ├── msrvtt_ret_test1k.json +│ │ │ └── msrvtt_test1k.json +│ │ └── videos_2fps_224 +│ │ ├── video0.mp4 +│ │ ├── video1.mp4 +│ │ ├── ... +│ │ └── video9999.mp4 +``` diff --git a/tools/data/msrvtt/README_zh-CN.md b/tools/data/msrvtt/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..16f8e9ba05d770b66ab6b81438981817d8a850ef --- /dev/null +++ b/tools/data/msrvtt/README_zh-CN.md @@ -0,0 +1,68 @@ +# 准备 MSR-VTT 检索/视频问答数据集 + +## 简介 + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/msrvtt/`。 + +## 步骤 1. 
下载标注文件 + +用户可从 [VindLU](https://github.com/klauscc/VindLU) 提供的 [Google Drive 链接](https://drive.google.com/drive/folders/12cr94wT8j7pR09AR2nmQg6o26Y1arI50)中直接下载以下与 MSR-VTT 相关的标注文件, 并放置到 `$MMACTION2/tools/data/msrvtt/annotations` 路径下: + +- [msrvtt_qa_train.json](https://drive.google.com/file/d/12dJq5_7v8FytrJwrPB_f22tET1MmGCNh/view?usp=drive_link) +- [msrvtt_qa_val.json](https://drive.google.com/file/d/138q-A-V8fCC2nBYJgqkQa3gBfXVNbNNd/view?usp=drive_link) +- [msrvtt_qa_test.json](https://drive.google.com/file/d/13IiEcUMHiNppWhGwVY1eAaip6iSJM35A/view?usp=drive_link) +- [msrvtt_qa_answer_list.json](https://drive.google.com/file/d/131euz_dssRkDTk3-ioAS5ZsvIxS_Tt4M/view?usp=drive_link) +- [msrvtt_mc_test.json](https://drive.google.com/file/d/13FrUQ2ZDsNDraP7lfnKvTArPIgdtHuLC/view?usp=drive_link) +- [msrvtt_ret_train9k.json](https://drive.google.com/file/d/13OVo0XRdVWTHlFFxbKg3daYCHsMbJxyd/view?usp=drive_link) +- [msrvtt_ret_train7k.json](https://drive.google.com/file/d/13ID97BX4ExO6mWPIUMp-GzXcPBkviSLx/view?usp=drive_link) +- [msrvtt_ret_test1k.json](https://drive.google.com/file/d/13FLrjI-aleKeU7LbJMDrYgktX7MbTbzu/view?usp=drive_link) +- [msrvtt_test1k.json](https://drive.google.com/file/d/12z6y-DNwIfICSzOhekbJwSbf7z2hlibE/view?usp=drive_link) + +## 步骤 2. 
准备视频数据 + +用户可参考该数据集的[官网](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/),以获取数据集相关的基本信息。运行下面的命令准备 MSRVTT 视频文件: + +```shell +# download original videos +bash download_msrvtt.sh +# preprocess videos to lower FPS and dimension +bash compress_msrvtt.sh +``` + +完成上述准备步骤后,文件目录如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ └── msrvtt +│ │ ├── annotations +│ │ │ ├── msrvtt_qa_train.json +│ │ │ ├── msrvtt_qa_val.json +│ │ │ ├── msrvtt_qa_test.json +│ │ │ ├── msrvtt_qa_answer_list.json +│ │ │ ├── msrvtt_mc_test.json +│ │ │ ├── msrvtt_ret_train9k.json +│ │ │ ├── msrvtt_ret_train7k.json +│ │ │ ├── msrvtt_ret_test1k.json +│ │ │ └── msrvtt_test1k.json +│ │ └── videos_2fps_224 +│ │ ├── video0.mp4 +│ │ ├── video1.mp4 +│ │ ├── ... +│ │ └── video9999.mp4 +``` diff --git a/tools/data/msrvtt/compress.py b/tools/data/msrvtt/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..5daca13f95967832d888e9086dd9aa4f438f38dd --- /dev/null +++ b/tools/data/msrvtt/compress.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Used to compress videos (FPS and dimensions) in the Singularity project. + +copied from https://github.com/klauscc/VindLU +""" +import argparse +import os +import shutil +import subprocess +from multiprocessing import Pool +from os.path import exists, join +from pathlib import Path + +try: + from psutil import cpu_count +except ImportError: + from multiprocessing import cpu_count + +from functools import partial + +from PIL import Image +from tqdm import tqdm + + +def resize_image(input_path, output_path, size=224): + with Image.open(input_path) as img: + w, h = img.width, img.height + r = 1. 
* w / h + if w > h: + h = size + w = r * size + else: + h = size / r + w = size + + img_resized = img.resize((int(w), int(h))) + img_resized.save(output_path) + + +def _compress_images(input_output_pair, size=224): + """Scale and downsample an input image to a given fps and size (shorter + side size). + + This also removes the audio from the image. + """ + input_image_path, output_image_path = input_output_pair + try: + resize_image(input_image_path, output_image_path, size) + except Exception as e: + print(f'Caught Exception {e}') + + +def _compress_videos(input_output_pair, size=224, fps=3): + """Scale and downsample an input video to a given fps and size (shorter + side size). + + This also removes the audio from the video. + """ + input_file_path, output_file_path = input_output_pair + try: + command = [ + 'ffmpeg', + '-y', # (optional) overwrite output file if it exists + '-i', + input_file_path, + '-filter:v', # no audio + f"scale='if(gt(a,1),trunc(oh*a/2)*2,{size})':'if(gt(a,1),{size},trunc(ow*a/2)*2)'", # noqa: E501 + '-map', + '0:v', # no audio + '-r', + str(fps), # frames per second + # '-g', str(16), + output_file_path, + ] + subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + except Exception as e: + raise e + + +def _compress(input_output_pair, fps=3, size=224, file_type='image'): + if file_type == 'image': + _compress_images(input_output_pair, size) + elif file_type == 'video': + _compress_videos(input_output_pair, size, fps) + + +def prepare_input_output_pairs(input_root, + output_root, + input_file_list_path=None): + # filename list in `input_file_list_path` can be created very fast using `ls -U . 
>> ../video_filenames.txt` # noqa: E501 + if input_file_list_path: + with open(input_file_list_path, 'r') as f: + filenames = [s.strip() for s in f.readlines()] + else: + filenames = [ + video_path.name for video_path in Path(input_root).glob('*.mp4') + ] + print(f'There are {len(filenames)} video/images files loaded from list.') + input_file_path_list = [] + output_file_path_list = [] + for e in tqdm(filenames, desc='find un-processed videos/images'): + input_file_path = join(input_root, e) + output_file_path = join(output_root, e) + if not exists(output_file_path): + input_file_path_list.append(input_file_path) + output_file_path_list.append(output_file_path) + return input_file_path_list, output_file_path_list + + +def run_compress(): + parser = argparse.ArgumentParser( + description='Compress videos/images for speed-up') + parser.add_argument( + '--input_root', type=str, help='input root', required=True) + parser.add_argument( + '--input_file_list_path', + type=str, + default=None, + help='list of video filenames under args.input_root, it can be ' + 'created efficiently with `ls -U /path/to/video >> /path/to/video_filenames.txt`' # noqa: E501 + ) + parser.add_argument( + '--output_root', type=str, help='output root', required=True) + parser.add_argument( + '--size', + type=int, + default=224, + help='shorter side size, aspect ratio is kept') + parser.add_argument('--num_workers', type=int, default=24, help='#workers') + parser.add_argument( + '--fps', + type=int, + default=3, + help='fps for output video, ignored if file_type == image') + parser.add_argument( + '--file_type', + type=str, + choices=['image', 'video'], + help='input file type') + args = parser.parse_args() + + # set paths + input_root = args.input_root + output_root = args.output_root + assert input_root != output_root + if not exists(output_root): + os.makedirs(output_root, exist_ok=True) + + # prepare and find un-processed + input_file_path_list, output_file_path_list = 
prepare_input_output_pairs( + input_root, + output_root, + input_file_list_path=args.input_file_list_path, + ) + print(f'input_file_path_list[:3] {input_file_path_list[:3]}') + print(f'output_file_path_list[:3] {output_file_path_list[:3]}') + print('Total videos/images need to process: {}'.format( + len(input_file_path_list))) + + # start parallel jobs + num_cores = cpu_count() + num_workers = args.num_workers + print( + f'Begin with {num_cores}-core logical processor, {num_workers} workers' + ) + compress = partial( + _compress, fps=args.fps, size=args.size, file_type=args.file_type) + input_pairs = list(zip(input_file_path_list, output_file_path_list)) + with Pool(num_workers) as pool, tqdm( + total=len(input_file_path_list), + desc='re-encoding videos/images') as pbar: + for idx, _ in enumerate( + pool.imap_unordered(compress, input_pairs, chunksize=32)): + pbar.update(1) + + # copy-paste failed files + print('Compress finished, copy-paste failed files...') + copy_count = 0 + for input_file_path, output_file_path in zip(input_file_path_list, + output_file_path_list): + if exists(input_file_path): + if exists(output_file_path) is False or os.path.getsize( + output_file_path) < 1.: + copy_count += 1 + shutil.copyfile(input_file_path, output_file_path) + print('Copy and replace file: {}'.format(output_file_path)) + print(f'copy_count {copy_count}') + + +if __name__ == '__main__': + run_compress() diff --git a/tools/data/msrvtt/compress_msrvtt.sh b/tools/data/msrvtt/compress_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..b62744fbe977960a9d90e254460c441508a28561 --- /dev/null +++ b/tools/data/msrvtt/compress_msrvtt.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +FPS=2 +SIZE=224 +DATA_DIR="../../../data/msrvtt/videos" +OUT_DIR="../../../data/msrvtt/videos_2fps_224" + +python compress.py \ + --input_root=${DATA_DIR} --output_root=${OUT_DIR} \ + --fps=${FPS} --size=${SIZE} --file_type=video --num_workers 24 diff --git 
a/tools/data/msrvtt/download_msrvtt.sh b/tools/data/msrvtt/download_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..9a7d1890855560b573b8b75544fe30541f7230da --- /dev/null +++ b/tools/data/msrvtt/download_msrvtt.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/msrvtt" +mkdir -p ${DATA_DIR} + +if [ -f "MSRVTT.zip" ]; then + echo "MSRVTT.zip exists, skip downloading!" +else + echo "Downloading MSRVTT.zip." + wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip +fi + +echo "Processing videos started." +unzip -q MSRVTT.zip -d ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/MSRVTT/videos/all" -name "video*.mp4" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/MSRVTT" +rm -rf "${DATA_DIR}/msrvtt_data" +rm msrvtt_data.zip +rm MSRVTT.zip +echo "The preparation of the msrvtt dataset has been successfully completed." diff --git a/tools/data/multisports/README.md b/tools/data/multisports/README.md new file mode 100644 index 0000000000000000000000000000000000000000..54f903e18ec3f574c366f9ef332a4bb6ef1bd5e7 --- /dev/null +++ b/tools/data/multisports/README.md @@ -0,0 +1,111 @@ +# Preparing Multisports + +## Introduction + + + +```BibTeX +@inproceedings{li2021multisports, + title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions}, + author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={13536--13545}, + year={2021} +} +``` + +For basic dataset information, please refer to the official [project](https://deeperaction.github.io/datasets/multisports.html) and the [paper](https://arxiv.org/abs/2105.07404). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/multisports/`. + +## Step 1. 
Prepare Annotations + +First of all, you have to download annotations and videos to `$MMACTION2/data/multisports` on the official [website](https://github.com/MCG-NJU/MultiSports), please also download the Person Boxes and put it to `$MMACTION2/data/multisports`. + +## Step 2. Prepare Videos + +Before this step, please make sure the folder structure looks like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── multisports +│ | ├── MultiSports_box.zip +│ | ├── trainval +│ | | ├── aerobic_gymnastics.zip +│ | | ├── basketball.zip +│ | | ├── multisports_GT.pkl +│ | | ├──... +│ | ├── test +│ | | ├── aerobic_gymnastics.zip +│ | | ├── basketball.zip +│ | | ├──... +``` + +Then, you can use the following command to uncompress. + +```shell +cd $MMACTION2/data/multisports/ +unzip MultiSports_box.zip +cd $MMACTION2/data/multisports/trainval +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/data/multisports/test +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/tools/data/multisports/ +``` + +## Step 3. Convert Annotations + +you can run the following script to convert annotations and proposals as we need. + +```shell +cd $MMACTION2/tools/data/multisports/ +python parse_anno.py +``` + +## Step 5. Check Directory Structure + +After the whole data process, you will get the videos and annotation files for MultiSports. + +In the context of the whole project (for MultiSports only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── multisports +│ | ├── annotations +| │ | ├── multisports_dense_proposals_test.recall_96.13.pkl +| │ | ├── multisports_dense_proposals_train.recall_96.13.pkl +| │ | ├── multisports_dense_proposals_val.recall_96.13.pkl +| │ | ├── multisports_GT.pkl +| │ | ├── multisports_train.csv +| │ | ├── multisports_val.csv +│ | ├── trainval +│ | | ├── aerobic_gymnastics +| │ | | ├── v__wAgwttPYaQ_c001.mp4 +| │ | | ├── v__wAgwttPYaQ_c002.mp4 +| │ | | ├── ... 
+│ | | ├── basketball +| │ | | ├── v_-6Os86HzwCs_c001.mp4 +| │ | | ├── v_-6Os86HzwCs_c002.mp4 +| │ | | ├── ... +│ | | ├── multisports_GT.pkl +│ | | ├──... +│ | ├── test +│ | | ├── aerobic_gymnastics +| │ | | ├── v_2KroSzspz-c_c001.mp4 +| │ | | ├── v_2KroSzspz-c_c002.mp4 +| │ | | ├── ... +│ | | ├── basketball +| │ | | ├── v_1tefH1iPbGM_c001.mp4 +| │ | | ├── v_1tefH1iPbGM_c002.mp4 +│ | | ├──... +``` + +We don't need the zip files under the project, you can handle them as you want. +For training and evaluating on MultiSports, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/multisports/README_zh-CN.md b/tools/data/multisports/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..39a35e9b4d918491af9aae89ddc3d0774c43c8a2 --- /dev/null +++ b/tools/data/multisports/README_zh-CN.md @@ -0,0 +1,111 @@ +# 准备 MultiSports + +## 介绍 + + + +```BibTeX +@inproceedings{li2021multisports, + title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions}, + author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={13536--13545}, + year={2021} +} +``` + +关于基本数据集信息,请参考官方 [项目](https://deeperaction.github.io/datasets/multisports.html) 和 [论文](https://arxiv.org/abs/2105.07404)。 +在我们开始之前,请确保目录位于 `$MMACTION2/tools/data/multisports/`。 + +## 第一步:准备标注 + +首先,你必须从官方 [网站](https://github.com/MCG-NJU/MultiSports) 下载标注和视频到 `$MMACTION2/data/multisports`,请同时下载人物检测框并将其放到 `$MMACTION2/data/multisports`。 + +## 第二步:准备视频 + +在这一步之前,请确保文件夹结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── multisports +│ | ├── MultiSports_box.zip +│ | ├── trainval +│ | | ├── aerobic_gymnastics.zip +│ | | ├── basketball.zip +│ | | ├── multisports_GT.pkl +│ | | ├──... 
+│ | ├── test +│ | | ├── aerobic_gymnastics.zip +│ | | ├── basketball.zip +│ | | ├──... +``` + +然后,你可以使用以下命令进行解压。 + +```shell +cd $MMACTION2/data/multisports/ +unzip MultiSports_box.zip +cd $MMACTION2/data/multisports/trainval +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/data/multisports/test +find . -name '*.zip' -exec unzip {} \; +cd $MMACTION2/tools/data/multisports/ +``` + +## 第三步:转换标注文件 + +你可以运行以下脚本来转换我们需要的标注文件和候选框。 + +```shell +cd $MMACTION2/tools/data/multisports/ +python parse_anno.py +``` + +## 第五步:检查目录结构 + +完成整个数据处理后,你将得到 MultiSports 数据集的视频和标注文件。 + +在整个项目的目录中(仅针对 MultiSports),文件夹结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── multisports +│ | ├── annotations +| │ | ├── multisports_dense_proposals_test.recall_96.13.pkl +| │ | ├── multisports_dense_proposals_train.recall_96.13.pkl +| │ | ├── multisports_dense_proposals_val.recall_96.13.pkl +| │ | ├── multisports_GT.pkl +| │ | ├── multisports_train.csv +| │ | ├── multisports_val.csv +│ | ├── trainval +│ | | ├── aerobic_gymnastics +| │ | | ├── v__wAgwttPYaQ_c001.mp4 +| │ | | ├── v__wAgwttPYaQ_c002.mp4 +| │ | | ├── ... +│ | | ├── basketball +| │ | | ├── v_-6Os86HzwCs_c001.mp4 +| │ | | ├── v_-6Os86HzwCs_c002.mp4 +| │ | | ├── ... +│ | | ├── multisports_GT.pkl +│ | | ├──... +│ | ├── test +│ | | ├── aerobic_gymnastics +| │ | | ├── v_2KroSzspz-c_c001.mp4 +| │ | | ├── v_2KroSzspz-c_c002.mp4 +| │ | | ├── ... +│ | | ├── basketball +| │ | | ├── v_1tefH1iPbGM_c001.mp4 +| │ | | ├── v_1tefH1iPbGM_c002.mp4 +│ | | ├──... +``` + +我们不需要项目下的 zip 文件,你可以按照自己的意愿处理它们。 +关于在 MultiSports 上进行训练和评估,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/multisports/format_det_result.py b/tools/data/multisports/format_det_result.py new file mode 100644 index 0000000000000000000000000000000000000000..693036f0d6b2777aa59b0f9bc5f9959f7b4977e9 --- /dev/null +++ b/tools/data/multisports/format_det_result.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from argparse import ArgumentParser + +import numpy as np +from mmengine import dump, load +from rich.progress import track + +from mmaction.evaluation import link_tubes + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('test_result', help='path of dumped results') + parser.add_argument( + '--anno-path', + default='data/multisports/videos/trainval/multisports_GT.pkl') + parser.add_argument( + '--frm_out_path', + default=None, + help='frame-level detection results output path') + parser.add_argument( + '--tube_out_path', + default=None, + help='tube-level detection results output path') + args = parser.parse_args() + if not args.frm_out_path: + args.frm_out_path = args.test_result[:-4] + '-formated.pkl' + if not args.tube_out_path: + args.tube_out_path = args.test_result[:-4] + '_vid_dets.pkl' + return args + + +def format_det_result(): + """convert test results to specified format in MultiSports competition.""" + test_results = load(args.test_result) + annos = load(args.anno_path) + test_videos = annos['test_videos'][0] + resolutions = annos['resolution'] + frm_dets = [] + for pred in track(test_results, description='formatting...'): + video_key = pred['video_id'].split('.mp4')[0] + frm_num = pred['timestamp'] + bboxes = pred['pred_instances']['bboxes'] + cls_scores = pred['pred_instances']['scores'] + for bbox, cls_score in zip(bboxes, cls_scores): + video_idx = test_videos.index(video_key) + pred_label = np.argmax(cls_score) + score = cls_score[pred_label] + h, w = resolutions[video_key] + bbox *= np.array([w, h, w, h]) + instance_result = np.array( + [video_idx, frm_num, pred_label, score, *bbox]) + frm_dets.append(instance_result) + frm_dets = np.array(frm_dets) + video_tubes = link_tubes(annos, frm_dets, K=1) + dump(frm_dets, args.frm_out_path) + dump(video_tubes, args.tube_out_path) + + +if __name__ == '__main__': + args = parse_args() + format_det_result() diff --git a/tools/data/multisports/label_map.txt 
b/tools/data/multisports/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a4428a54a612eb653f9aae9005d72ccad7530b4 --- /dev/null +++ b/tools/data/multisports/label_map.txt @@ -0,0 +1,66 @@ +0: aerobic_push_up +1: aerobic_explosive_push_up +2: aerobic_explosive_support +3: aerobic_leg_circle +4: aerobic_helicopter +5: aerobic_support +6: aerobic_v_support +7: aerobic_horizontal_support +8: aerobic_straight_jump +9: aerobic_illusion +10: aerobic_bent_leg(s)_jump +11: aerobic_pike_jump +12: aerobic_straddle_jump +13: aerobic_split_jump +14: aerobic_scissors_leap +15: aerobic_kick_jump +16: aerobic_off_axis_jump +17: aerobic_butterfly_jump +18: aerobic_split +19: aerobic_turn +20: aerobic_balance_turn +21: volleyball_serve +22: volleyball_block +23: volleyball_first_pass +24: volleyball_defend +25: volleyball_protect +26: volleyball_second_pass +27: volleyball_adjust +28: volleyball_save +29: volleyball_second_attack +30: volleyball_spike +31: volleyball_dink +32: volleyball_no_offensive_attack +33: football_shoot +34: football_long_pass +35: football_short_pass +36: football_through_pass +37: football_cross +38: football_dribble +39: football_trap +40: football_throw +41: football_diving +42: football_tackle +43: football_steal +44: football_clearance +45: football_block +46: football_press +47: football_aerial_duels +48: basketball_pass +49: basketball_drive +50: basketball_dribble +51: basketball_3-point_shot +52: basketball_2-point_shot +53: basketball_free_throw +54: basketball_block +55: basketball_offensive_rebound +56: basketball_defensive_rebound +57: basketball_pass_steal +58: basketball_dribble_steal +59: basketball_interfere_shot +60: basketball_pick-and-roll_defensive +61: basketball_sag +62: basketball_screen +63: basketball_pass-inbound +64: basketball_save +65: basketball_jump_ball diff --git a/tools/data/multisports/parse_anno.py b/tools/data/multisports/parse_anno.py new file mode 100644 index 
0000000000000000000000000000000000000000..4987bc385996d835b9bb5e385fe5f494c9933bbe --- /dev/null +++ b/tools/data/multisports/parse_anno.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import csv +import os +import os.path as osp +from argparse import ArgumentParser + +import numpy as np +from mmengine import dump, list_dir_or_file, load + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + '--data-root', + default='data/multisports', + help='the directory to multisports annotations') + parser.add_argument( + '--out-root', + default='data/multisports', + help='output directory of output annotation files') + parser.add_argument('--dump-proposals', action='store_true') + args = parser.parse_args() + return args + + +def parse_anno(args): + if not osp.exists(args.out_root): + os.makedirs(osp.join(args.out_root, 'annotations')) + + anno_path = osp.join(args.data_root, 'annotations/multisports_GT.pkl') + annos = load(anno_path) + + # convert key in proposal file to filename + key2filename = { + video.split('/')[1]: video + '.mp4' + for video in annos['nframes'].keys() + } + test_videos = [ + file for file in list_dir_or_file( + osp.join(args.data_root, 'test'), recursive=True) + if file.endswith('.mp4') + ] + key2filename.update( + {video.split('/')[1][:-4]: video + for video in test_videos}) + # convert proposal bboxes + if args.dump_proposals: + proposals_path = osp.join(args.data_root, + 'annotations/MultiSports_box') + for proposals in os.listdir(proposals_path): + proposal_info = load(osp.join(proposals_path, proposals)) + proposal_out = dict() + for key in proposal_info.keys(): + key_split = key.split(',') + if key_split[0] in key2filename.keys(): + new_key = \ + f'{key2filename[key_split[0]]},{int(key_split[1]):04d}' + proposal_out[new_key] = proposal_info[key] + target_path = osp.join(args.out_root, 'annotations', + 'multisports_dense_proposals_' + proposals) + dump(proposal_out, target_path) + # dump train and val 
list + for split in ['train', 'val']: + out_anno_path = osp.join(args.out_root, 'annotations', + f'multisports_{split}.csv') + with open(out_anno_path, 'w') as csv_f: + writer = csv.writer(csv_f) + if split == 'train': + video_list = annos['train_videos'][0] + elif split == 'val': + video_list = annos['test_videos'][0] + gt_tubes = annos['gttubes'] + resolutions = annos['resolution'] + for video_id in video_list: + vid_tubes = gt_tubes[video_id] + h, w = resolutions[video_id] + for label, tubes in vid_tubes.items(): + entity_id = 0 + for tube in tubes: + for frame_anno in tube: + frame_stamp = int(frame_anno[0]) + entity_box = frame_anno[1:] + entity_box /= np.array([w, h, w, h]) + entity_box = [f'{num:.3f}' for num in entity_box] + filename = video_id + '.mp4' + anno_line = [ + filename, frame_stamp, *entity_box, label, + entity_id + ] + writer.writerow(anno_line) + entity_id += 1 + + +if __name__ == '__main__': + args = parse_args() + parse_anno(args) diff --git a/tools/data/omnisource/README.md b/tools/data/omnisource/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e0ea002937683e520b5c6a69ba2d7cc5901c52d --- /dev/null +++ b/tools/data/omnisource/README.md @@ -0,0 +1,150 @@ +# Preparing OmniSource + +## Introduction + + + +```BibTeX +@article{duan2020omni, + title={Omni-sourced Webly-supervised Learning for Video Recognition}, + author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, + journal={arXiv preprint arXiv:2003.13042}, + year={2020} +} +``` + +We release a subset of the OmniSource web dataset used in the paper [Omni-sourced Webly-supervised Learning for Video Recognition](https://arxiv.org/abs/2003.13042). 
Since all web dataset in OmniSource are built based on the Kinetics-400 taxonomy, we select those web data related to the 200 classes in Mini-Kinetics subset (which is proposed in [Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification](https://arxiv.org/pdf/1712.04851.pdf)). + +We provide data from all sources that are related to the 200 classes in Mini-Kinetics (including Kinetics trimmed clips, Kinetics untrimmed videos, images from Google and Instagram, video clips from Instagram). To obtain this dataset, please first fill in the [request form](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link). We will share the download link to you after your request is received. Since we release all data crawled from the web without any filtering, the dataset is large and it may take some time to download them. We describe the size of the datasets in the following table: + +| Dataset Name | #samples | Size | Teacher Model | #samples after filtering | #samples similar to k200_val | +| :-------------: | :------: | :-----: | :--------------: | :----------------------: | :--------------------------: | +| k200_train | 76030 | 45.6G | N/A | N/A | N/A | +| k200_val | 4838 | 2.9G | N/A | N/A | N/A | +| googleimage_200 | 3050880 | 265.5G | TSN-R50-8seg | 1188695 | 967 | +| insimage_200 | 3654650 | 224.4G | TSN-R50-8seg | 879726 | 116 | +| insvideo_200 | 732855 | 1487.6G | SlowOnly-8x8-R50 | 330680 | 956 | +| k200_raw_train | 76027 | 963.5G | SlowOnly-8x8-R50 | N/A | N/A | + +The file structure of our uploaded OmniSource dataset looks like: + +``` +OmniSource/ +├── annotations +│ ├── googleimage_200 +│ │ ├── googleimage_200.txt File list of all valid images crawled from Google. +│ │ ├── tsn_8seg_googleimage_200_duplicate.txt Positive file list of images crawled from Google, which is similar to a validation example. 
+│ │ ├── tsn_8seg_googleimage_200.txt Positive file list of images crawled from Google, filtered by the teacher model. +│ │ └── tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. +│ ├── insimage_200 +│ │ ├── insimage_200.txt +│ │ ├── tsn_8seg_insimage_200_duplicate.txt +│ │ ├── tsn_8seg_insimage_200.txt +│ │ └── tsn_8seg_insimage_200_wodup.txt +│ ├── insvideo_200 +│ │ ├── insvideo_200.txt +│ │ ├── slowonly_8x8_insvideo_200_duplicate.txt +│ │ ├── slowonly_8x8_insvideo_200.txt +│ │ └── slowonly_8x8_insvideo_200_wodup.txt +│ ├── k200_actions.txt The list of action names of the 200 classes in MiniKinetics. +│ ├── K400_to_MiniKinetics_classidx_mapping.json The index mapping from Kinetics-400 to MiniKinetics. +│ ├── kinetics_200 +│ │ ├── k200_train.txt +│ │ └── k200_val.txt +│ ├── kinetics_raw_200 +│ │ └── slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +│ └── webimage_200 +│ └── tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +├── googleimage_200 (10 volumes) +│ ├── vol_0.tar +│ ├── ... +│ └── vol_9.tar +├── insimage_200 (10 volumes) +│ ├── vol_0.tar +│ ├── ... +│ └── vol_9.tar +├── insvideo_200 (20 volumes) +│ ├── vol_00.tar +│ ├── ... +│ └── vol_19.tar +├── kinetics_200_train +│ └── kinetics_200_train.tar +├── kinetics_200_val +│ └── kinetics_200_val.tar +└── kinetics_raw_200_train (16 volumes) + ├── vol_0.tar + ├── ... + └── vol_15.tar +``` + +## Data Preparation + +For data preparation, you need to first download those data. For `kinetics_200` and 3 web datasets: `googleimage_200`, `insimage_200` and `insvideo_200`, you just need to extract each volume and merge their contents. + +For Kinetics raw videos, since loading long videos is very heavy, you need to first trim it into clips. Here we provide a script named `trim_raw_video.py`. 
It trims a long video into 10-second clips and remove the original raw video. You can use it to trim the Kinetics raw video. + +The data should be placed in `data/OmniSource/`. When data preparation finished, the folder structure of `data/OmniSource` looks like (We omit the files not needed in training & testing for simplicity): + +``` +data/OmniSource/ +├── annotations +│ ├── googleimage_200 +│ │ └── tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. +│ ├── insimage_200 +│ │ └── tsn_8seg_insimage_200_wodup.txt +│ ├── insvideo_200 +│ │ └── slowonly_8x8_insvideo_200_wodup.txt +│ ├── kinetics_200 +│ │ ├── k200_train.txt +│ │ └── k200_val.txt +│ ├── kinetics_raw_200 +│ │ └── slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +│ └── webimage_200 +│ └── tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +├── googleimage_200 +│ ├── 000 +| │ ├── 00 +| │ │ ├── 000001.jpg +| │ │ ├── ... +| │ │ └── 000901.jpg +| │ ├── ... +| │ ├── 19 +│ ├── ... +│ └── 199 +├── insimage_200 +│ ├── 000 +| │ ├── abseil +| │ │ ├── 1J9tKWCNgV_0.jpg +| │ │ ├── ... +| │ │ └── 1J9tKWCNgV_0.jpg +| │ ├── abseiling +│ ├── ... +│ └── 199 +├── insvideo_200 +│ ├── 000 +| │ ├── abseil +| │ │ ├── B00arxogubl.mp4 +| │ │ ├── ... +| │ │ └── BzYsP0HIvbt.mp4 +| │ ├── abseiling +│ ├── ... +│ └── 199 +├── kinetics_200_train +│ ├── 0074cdXclLU.mp4 +| ├── ... +| ├── zzzlyL61Fyo.mp4 +├── kinetics_200_val +│ ├── 01fAWEHzudA.mp4 +| ├── ... +| ├── zymA_6jZIz4.mp4 +└── kinetics_raw_200_train +│ ├── pref_ +│ | ├── ___dTOdxzXY +| │ │ ├── part_0.mp4 +| │ │ ├── ... +| │ │ ├── part_6.mp4 +│ | ├── ... +│ | └── _zygwGDE2EM +│ ├── ... 
+│ └── prefZ +``` diff --git a/tools/data/omnisource/README_zh-CN.md b/tools/data/omnisource/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c2aeb5c1c2211d2e7c6002ff560b381e8302a2c2 --- /dev/null +++ b/tools/data/omnisource/README_zh-CN.md @@ -0,0 +1,149 @@ +# 准备 OmniSource + +## 简介 + + + +```BibTeX +@article{duan2020omni, + title={Omni-sourced Webly-supervised Learning for Video Recognition}, + author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, + journal={arXiv preprint arXiv:2003.13042}, + year={2020} +} +``` + +MMAction2 中发布了 OmniSource 网络数据集的一个子集 (来自论文 [Omni-sourced Webly-supervised Learning for Video Recognition](https://arxiv.org/abs/2003.13042))。 +OmniSource 数据集中所有类别均来自 Kinetics-400。MMAction2 所提供的子集包含属于 Mini-Kinetics 数据集 200 类动作的网络数据 (Mini-inetics 数据集由论文 [Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification](https://arxiv.org/pdf/1712.04851.pdf) 提出)。 + +MMAction2 提供所有数据源中属于 Mini-Kinetics 200 类动作的数据,这些数据源包含:Kinetics 数据集,Kinetics 原始数据集(未经裁剪的长视频),来自 Google 和 Instagram 的网络图片,来自 Instagram 的网络视频。为获取这一数据集,用户需先填写 [数据申请表](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link)。在接收到申请后,下载链接将被发送至用户邮箱。由于发布的数据集均为爬取所得的原始数据,数据集较大,下载需要一定时间。下表中提供了 OmniSource 数据集各个分量的统计信息。 + +| 数据集名称 | 样本个数 | 所占空间 | 过滤使用的 Teacher 模型 | 过滤后的样本个数 | 与 k200_val 中样本相似(疑似重复)的样本个数 | +| :-------------: | :------: | :------: | :---------------------: | :--------------: | :------------------------------------------: | +| k200_train | 76030 | 45.6G | N/A | N/A | N/A | +| k200_val | 4838 | 2.9G | N/A | N/A | N/A | +| googleimage_200 | 3050880 | 265.5G | TSN-R50-8seg | 1188695 | 967 | +| insimage_200 | 3654650 | 224.4G | TSN-R50-8seg | 879726 | 116 | +| insvideo_200 | 732855 | 1487.6G | SlowOnly-8x8-R50 | 330680 | 956 | +| k200_raw_train | 76027 | 963.5G | SlowOnly-8x8-R50 | N/A | N/A | + +MMAction2 所发布的 OmniSource 数据集目录结构如下所示: + +``` 
+OmniSource/ +├── annotations +│ ├── googleimage_200 +│ │ ├── googleimage_200.txt 从 Google 爬取到的所有图片列表 +│ │ ├── tsn_8seg_googleimage_200_duplicate.txt 从 Google 爬取到的,疑似与 k200-val 中样本重复的正样本列表 +│ │ ├── tsn_8seg_googleimage_200.txt 从 Google 爬取到的,经过 teacher 模型过滤的正样本列表 +│ │ └── tsn_8seg_googleimage_200_wodup.txt 从 Google 爬取到的,经过 teacher 模型过滤及去重的正样本列表 +│ ├── insimage_200 +│ │ ├── insimage_200.txt +│ │ ├── tsn_8seg_insimage_200_duplicate.txt +│ │ ├── tsn_8seg_insimage_200.txt +│ │ └── tsn_8seg_insimage_200_wodup.txt +│ ├── insvideo_200 +│ │ ├── insvideo_200.txt +│ │ ├── slowonly_8x8_insvideo_200_duplicate.txt +│ │ ├── slowonly_8x8_insvideo_200.txt +│ │ └── slowonly_8x8_insvideo_200_wodup.txt +│ ├── k200_actions.txt MiniKinetics 中 200 类动作的名称 +│ ├── K400_to_MiniKinetics_classidx_mapping.json Kinetics 中的类索引至 MiniKinetics 中的类索引的映射 +│ ├── kinetics_200 +│ │ ├── k200_train.txt +│ │ └── k200_val.txt +│ └── kinetics_raw_200 +│ └── slowonly_8x8_kinetics_raw_200.json 经 teacher 模型过滤后的 Kinetics 原始视频片段 +├── googleimage_200 共 10 卷 +│ ├── vol_0.tar +│ ├── ... +│ └── vol_9.tar +├── insimage_200 共 10 卷 +│ ├── vol_0.tar +│ ├── ... +│ └── vol_9.tar +├── insvideo_200 共 20 卷 +│ ├── vol_00.tar +│ ├── ... +│ └── vol_19.tar +├── kinetics_200_train +│ └── kinetics_200_train.tar +├── kinetics_200_val +│ └── kinetics_200_val.tar +└── kinetics_raw_200_train 共 16 卷 + ├── vol_0.tar + ├── ... + └── vol_15.tar +``` + +## 数据准备 + +用户需要首先完成数据下载,对于 `kinetics_200` 和三个网络数据集 `googleimage_200`, `insimage_200`, `insvideo_200`,用户仅需解压各压缩卷并将其合并至一处。 + +对于 Kinetics 原始视频,由于直接读取长视频非常耗时,用户需要先将其分割为小段。MMAction2 提供了名为 `trim_raw_video.py` 的脚本,用于将长视频分割至 10 秒的小段(分割完成后删除长视频)。用户可利用这一脚本分割长视频。 + +所有数据应位于 `data/OmniSource/` 目录下。完成数据准备后,`data/OmniSource/` 目录的结构应如下所示(为简洁,省去了训练及测试时未使用的文件): + +``` +data/OmniSource/ +├── annotations +│ ├── googleimage_200 +│ │ └── tsn_8seg_googleimage_200_wodup.txt Positive file list of images crawled from Google, filtered by the teacher model, after de-duplication. 
+│ ├── insimage_200 +│ │ └── tsn_8seg_insimage_200_wodup.txt +│ ├── insvideo_200 +│ │ └── slowonly_8x8_insvideo_200_wodup.txt +│ ├── kinetics_200 +│ │ ├── k200_train.txt +│ │ └── k200_val.txt +│ ├── kinetics_raw_200 +│ │ └── slowonly_8x8_kinetics_raw_200.json Kinetics Raw Clips filtered by the teacher model. +│ └── webimage_200 +│ └── tsn_8seg_webimage_200_wodup.txt The union of `tsn_8seg_googleimage_200_wodup.txt` and `tsn_8seg_insimage_200_wodup.txt` +├── googleimage_200 +│ ├── 000 +| │ ├── 00 +| │ │ ├── 000001.jpg +| │ │ ├── ... +| │ │ └── 000901.jpg +| │ ├── ... +| │ ├── 19 +│ ├── ... +│ └── 199 +├── insimage_200 +│ ├── 000 +| │ ├── abseil +| │ │ ├── 1J9tKWCNgV_0.jpg +| │ │ ├── ... +| │ │ └── 1J9tKWCNgV_0.jpg +| │ ├── abseiling +│ ├── ... +│ └── 199 +├── insvideo_200 +│ ├── 000 +| │ ├── abseil +| │ │ ├── B00arxogubl.mp4 +| │ │ ├── ... +| │ │ └── BzYsP0HIvbt.mp4 +| │ ├── abseiling +│ ├── ... +│ └── 199 +├── kinetics_200_train +│ ├── 0074cdXclLU.mp4 +| ├── ... +| ├── zzzlyL61Fyo.mp4 +├── kinetics_200_val +│ ├── 01fAWEHzudA.mp4 +| ├── ... +| ├── zymA_6jZIz4.mp4 +└── kinetics_raw_200_train +│ ├── pref_ +│ | ├── ___dTOdxzXY +| │ │ ├── part_0.mp4 +| │ │ ├── ... +| │ │ ├── part_6.mp4 +│ | ├── ... +│ | └── _zygwGDE2EM +│ ├── ... +│ └── prefZ +``` diff --git a/tools/data/omnisource/trim_raw_video.py b/tools/data/omnisource/trim_raw_video.py new file mode 100644 index 0000000000000000000000000000000000000000..a4405f58e0771b26aaef24b10e662ac75af0f972 --- /dev/null +++ b/tools/data/omnisource/trim_raw_video.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import sys +from subprocess import check_output + +import mmengine + + +def get_duration(vid_name): + command = f'ffprobe -i {vid_name} 2>&1 | grep "Duration"' + output = str(check_output(command, shell=True)) + output = output.split(',')[0].split('Duration:')[1].strip() + h, m, s = output.split(':') + duration = int(h) * 3600 + int(m) * 60 + float(s) + return duration + + +def trim(vid_name): + try: + lt = get_duration(vid_name) + except Exception: + print(f'get_duration failed for video {vid_name}', flush=True) + return + + i = 0 + name, _ = osp.splitext(vid_name) + + # We output 10-second clips into the folder `name` + dest = name + mmengine.mkdir_or_exist(dest) + + command_tmpl = ('ffmpeg -y -loglevel error -i {} -ss {} -t {} -crf 18 ' + '-c:v libx264 {}/part_{}.mp4') + while i * 10 < lt: + os.system(command_tmpl.format(vid_name, i * 10, 10, dest, i)) + i += 1 + + # remove a raw video after decomposing it into 10-second clips to save space + os.remove(vid_name) + + +if __name__ == '__main__': + vid_name = sys.argv[1] + trim(vid_name) diff --git a/tools/data/parse_file_list.py b/tools/data/parse_file_list.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb4e6cecefe42eadb9efd436f5f4473b4d386b9 --- /dev/null +++ b/tools/data/parse_file_list.py @@ -0,0 +1,535 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import csv +import fnmatch +import glob +import json +import os +import os.path as osp + + +def parse_directory(path, + rgb_prefix='img_', + flow_x_prefix='flow_x_', + flow_y_prefix='flow_y_', + level=1): + """Parse directories holding extracted frames from standard benchmarks. + + Args: + path (str): Directory path to parse frames. + rgb_prefix (str): Prefix of generated rgb frames name. + default: 'img_'. + flow_x_prefix (str): Prefix of generated flow x name. + default: `flow_x_`. + flow_y_prefix (str): Prefix of generated flow y name. + default: `flow_y_`. 
+ level (int): Directory level for glob searching. Options are 1 and 2. + default: 1. + + Returns: + dict: frame info dict with video id as key and tuple(path(str), + rgb_num(int), flow_x_num(int)) as value. + """ + print(f'parse frames under directory {path}') + if level == 1: + # Only search for one-level directory + def locate_directory(x): + return osp.basename(x) + + frame_dirs = glob.glob(osp.join(path, '*')) + + elif level == 2: + # search for two-level directory + def locate_directory(x): + return osp.join(osp.basename(osp.dirname(x)), osp.basename(x)) + + frame_dirs = glob.glob(osp.join(path, '*', '*')) + + else: + raise ValueError('level can be only 1 or 2') + + def count_files(directory, prefix_list): + """Count file number with a given directory and prefix. + + Args: + directory (str): Data directory to be search. + prefix_list (list): List or prefix. + + Returns: + list (int): Number list of the file with the prefix. + """ + lst = os.listdir(directory) + cnt_list = [len(fnmatch.filter(lst, x + '*')) for x in prefix_list] + return cnt_list + + # check RGB + frame_dict = {} + for i, frame_dir in enumerate(frame_dirs): + total_num = count_files(frame_dir, + (rgb_prefix, flow_x_prefix, flow_y_prefix)) + dir_name = locate_directory(frame_dir) + + num_x = total_num[1] + num_y = total_num[2] + if num_x != num_y: + raise ValueError(f'x and y direction have different number ' + f'of flow images in video directory: {frame_dir}') + if i % 200 == 0: + print(f'{i} videos parsed') + frame_dict[dir_name] = (frame_dir, total_num[0], num_x) + + print('frame directory analysis done') + return frame_dict + + +def parse_ucf101_splits(level): + """Parse UCF-101 dataset into "train", "val", "test" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of UCF-101. 
+ """ + class_index_file = 'data/ucf101/annotations/classInd.txt' + train_file_template = 'data/ucf101/annotations/trainlist{:02d}.txt' + test_file_template = 'data/ucf101/annotations/testlist{:02d}.txt' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip().split() for x in fin] + class_mapping = {x[1]: int(x[0]) - 1 for x in class_index} + + def line_to_map(line): + """A function to map line string to video and label. + + Args: + line (str): A long directory path, which is a text path. + + Returns: + tuple[str, str]: (video, label), video is the video id, + label is the video label. + """ + items = line.strip().split() + video = osp.splitext(items[0])[0] + if level == 1: + video = osp.basename(video) + label = items[0] + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + label = class_mapping[osp.dirname(items[0])] + return video, label + + splits = [] + for i in range(1, 4): + with open(train_file_template.format(i), 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(test_file_template.format(i), 'r') as fin: + test_list = [line_to_map(x) for x in fin] + splits.append((train_list, test_list)) + + return splits + + +def parse_jester_splits(level): + """Parse Jester into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Jester dataset. 
+ """ + # Read the annotations + class_index_file = 'data/jester/annotations/jester-v1-labels.csv' + train_file = 'data/jester/annotations/jester-v1-train.csv' + val_file = 'data/jester/annotations/jester-v1-validation.csv' + test_file = 'data/jester/annotations/jester-v1-test.csv' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip() for x in fin] + class_mapping = {class_index[idx]: idx for idx in range(len(class_index))} + + def line_to_map(line, test_mode=False): + items = line.strip().split(';') + video = items[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + label = class_mapping[items[1]] + return video, label + + with open(train_file, 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(val_file, 'r') as fin: + val_list = [line_to_map(x) for x in fin] + + with open(test_file, 'r') as fin: + test_list = [line_to_map(x, test_mode=True) for x in fin] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_sthv1_splits(level): + """Parse Something-Something dataset V1 into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Something-Something V1 dataset. 
+ """ + # Read the annotations + # yapf: disable + class_index_file = 'data/sthv1/annotations/something-something-v1-labels.csv' # noqa + # yapf: enable + train_file = 'data/sthv1/annotations/something-something-v1-train.csv' + val_file = 'data/sthv1/annotations/something-something-v1-validation.csv' + test_file = 'data/sthv1/annotations/something-something-v1-test.csv' + + with open(class_index_file, 'r') as fin: + class_index = [x.strip() for x in fin] + class_mapping = {class_index[idx]: idx for idx in range(len(class_index))} + + def line_to_map(line, test_mode=False): + items = line.strip().split(';') + video = items[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + label = class_mapping[items[1]] + return video, label + + with open(train_file, 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(val_file, 'r') as fin: + val_list = [line_to_map(x) for x in fin] + + with open(test_file, 'r') as fin: + test_list = [line_to_map(x, test_mode=True) for x in fin] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_sthv2_splits(level): + """Parse Something-Something dataset V2 into "train", "val" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + + Returns: + list: "train", "val", "test" splits of Something-Something V2 dataset. 
+ """ + # Read the annotations + # yapf: disable + class_index_file = 'data/sthv2/annotations/something-something-v2-labels.json' # noqa + # yapf: enable + train_file = 'data/sthv2/annotations/something-something-v2-train.json' + val_file = 'data/sthv2/annotations/something-something-v2-validation.json' + test_file = 'data/sthv2/annotations/something-something-v2-test.json' + + with open(class_index_file, 'r') as fin: + class_mapping = json.loads(fin.read()) + + def line_to_map(item, test_mode=False): + video = item['id'] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + if test_mode: + return video + + template = item['template'].replace('[', '') + template = template.replace(']', '') + label = int(class_mapping[template]) + return video, label + + with open(train_file, 'r') as fin: + items = json.loads(fin.read()) + train_list = [line_to_map(item) for item in items] + + with open(val_file, 'r') as fin: + items = json.loads(fin.read()) + val_list = [line_to_map(item) for item in items] + + with open(test_file, 'r') as fin: + items = json.loads(fin.read()) + test_list = [line_to_map(item, test_mode=True) for item in items] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_mmit_splits(): + """Parse Multi-Moments in Time dataset into "train", "val" splits. + + Returns: + list: "train", "val", "test" splits of Multi-Moments in Time. 
+ """ + + # Read the annotations + def line_to_map(x): + video = osp.splitext(x[0])[0] + labels = [int(digit) for digit in x[1:]] + return video, labels + + csv_reader = csv.reader(open('data/mmit/annotations/trainingSet.csv')) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open('data/mmit/annotations/validationSet.csv')) + val_list = [line_to_map(x) for x in csv_reader] + + test_list = val_list # not test for mit + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_kinetics_splits(level, dataset): + """Parse Kinetics dataset into "train", "val", "test" splits. + + Args: + level (int): Directory level of data. 1 for the single-level directory, + 2 for the two-level directory. + dataset (str): Denotes the version of Kinetics that needs to be parsed, + choices are "kinetics400", "kinetics600" and "kinetics700". + + Returns: + list: "train", "val", "test" splits of Kinetics. + """ + + def convert_label(s, keep_whitespaces=False): + """Convert label name to a formal string. + + Remove redundant '"' and convert whitespace to '_'. + + Args: + s (str): String to be converted. + keep_whitespaces(bool): Whether to keep whitespace. Default: False. + + Returns: + str: Converted string. + """ + if not keep_whitespaces: + return s.replace('"', '').replace(' ', '_') + + return s.replace('"', '') + + def line_to_map(x, test=False): + """A function to map line string to video and label. + + Args: + x (str): A single line from Kinetics csv file. + test (bool): Indicate whether the line comes from test + annotation file. + + Returns: + tuple[str, str]: (video, label), video is the video id, + label is the video label. 
+ """ + if test: + # video = f'{x[0]}_{int(x[1]):06d}_{int(x[2]):06d}' + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + label = -1 # label unknown + return video, label + + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + if level == 2: + video = f'{convert_label(x[0])}/{video}' + else: + assert level == 1 + label = class_mapping[convert_label(x[0])] + return video, label + + train_file = f'data/{dataset}/annotations/kinetics_train.csv' + val_file = f'data/{dataset}/annotations/kinetics_val.csv' + test_file = f'data/{dataset}/annotations/kinetics_test.csv' + + csv_reader = csv.reader(open(train_file)) + # skip the first line + next(csv_reader) + + labels_sorted = sorted({convert_label(row[0]) for row in csv_reader}) + class_mapping = {label: i for i, label in enumerate(labels_sorted)} + + csv_reader = csv.reader(open(train_file)) + next(csv_reader) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open(val_file)) + next(csv_reader) + val_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open(test_file)) + next(csv_reader) + test_list = [line_to_map(x, test=True) for x in csv_reader] + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_mit_splits(): + """Parse Moments in Time dataset into "train", "val" splits. + + Returns: + list: "train", "val", "test" splits of Moments in Time. 
+ """ + # Read the annotations + class_mapping = {} + with open('data/mit/annotations/moments_categories.txt') as f_cat: + for line in f_cat.readlines(): + cat, digit = line.rstrip().split(',') + class_mapping[cat] = int(digit) + + def line_to_map(x): + video = osp.splitext(x[0])[0] + label = class_mapping[osp.dirname(x[0])] + return video, label + + csv_reader = csv.reader(open('data/mit/annotations/trainingSet.csv')) + train_list = [line_to_map(x) for x in csv_reader] + + csv_reader = csv.reader(open('data/mit/annotations/validationSet.csv')) + val_list = [line_to_map(x) for x in csv_reader] + + test_list = val_list # no test for mit + + splits = ((train_list, val_list, test_list), ) + return splits + + +def parse_hmdb51_split(level): + train_file_template = 'data/hmdb51/annotations/trainlist{:02d}.txt' + test_file_template = 'data/hmdb51/annotations/testlist{:02d}.txt' + class_index_file = 'data/hmdb51/annotations/classInd.txt' + + def generate_class_index_file(): + """This function will generate a `ClassInd.txt` for HMDB51 in a format + like UCF101, where class id starts with 1.""" + video_path = 'data/hmdb51/videos' + annotation_dir = 'data/hmdb51/annotations' + + class_list = sorted(os.listdir(video_path)) + class_dict = dict() + if not osp.exists(class_index_file): + with open(class_index_file, 'w') as f: + content = [] + for class_id, class_name in enumerate(class_list): + # like `ClassInd.txt` in UCF-101, + # the class_id begins with 1 + class_dict[class_name] = class_id + 1 + cur_line = ' '.join([str(class_id + 1), class_name]) + content.append(cur_line) + content = '\n'.join(content) + f.write(content) + else: + print(f'{class_index_file} has been generated before.') + class_dict = { + class_name: class_id + 1 + for class_id, class_name in enumerate(class_list) + } + + for i in range(1, 4): + train_content = [] + test_content = [] + for class_name in class_dict: + filename = class_name + f'_test_split{i}.txt' + filename_path = osp.join(annotation_dir, 
filename) + with open(filename_path, 'r') as fin: + for line in fin: + video_info = line.strip().split() + video_name = video_info[0] + if video_info[1] == '1': + target_line = ' '.join([ + osp.join(class_name, video_name), + str(class_dict[class_name]) + ]) + train_content.append(target_line) + elif video_info[1] == '2': + target_line = ' '.join([ + osp.join(class_name, video_name), + str(class_dict[class_name]) + ]) + test_content.append(target_line) + train_content = '\n'.join(train_content) + test_content = '\n'.join(test_content) + with open(train_file_template.format(i), 'w') as fout: + fout.write(train_content) + with open(test_file_template.format(i), 'w') as fout: + fout.write(test_content) + + generate_class_index_file() + + with open(class_index_file, 'r') as fin: + class_index = [x.strip().split() for x in fin] + class_mapping = {x[1]: int(x[0]) - 1 for x in class_index} + + def line_to_map(line): + items = line.strip().split() + video = osp.splitext(items[0])[0] + if level == 1: + video = osp.basename(video) + elif level == 2: + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) + label = class_mapping[osp.dirname(items[0])] + return video, label + + splits = [] + for i in range(1, 4): + with open(train_file_template.format(i), 'r') as fin: + train_list = [line_to_map(x) for x in fin] + + with open(test_file_template.format(i), 'r') as fin: + test_list = [line_to_map(x) for x in fin] + splits.append((train_list, test_list)) + + return splits + + +def parse_diving48_splits(): + + train_file = 'data/diving48/annotations/Diving48_V2_train.json' + test_file = 'data/diving48/annotations/Diving48_V2_test.json' + + train = json.load(open(train_file)) + test = json.load(open(test_file)) + + # class_index_file = 'data/diving48/annotations/Diving48_vocab.json' + # class_list = json.load(open(class_index_file)) + + train_list = [] + test_list = [] + + for item in train: + vid_name = item['vid_name'] + label = item['label'] + 
train_list.append((vid_name, label)) + + for item in test: + vid_name = item['vid_name'] + label = item['label'] + test_list.append((vid_name, label)) + + splits = ((train_list, test_list), ) + return splits diff --git a/tools/data/resize_videos.py b/tools/data/resize_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..ec1170040a09f62944406741a8742331a9a74aba --- /dev/null +++ b/tools/data/resize_videos.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import sys +from multiprocessing import Pool + + +def resize_videos(vid_item): + """Generate resized video cache. + + Args: + vid_item (list): Video item containing video full path, + video relative path. + + Returns: + bool: Whether generate video cache successfully. + """ + full_path, vid_path = vid_item + # Change the output video extension to .mp4 if '--to-mp4' flag is set + if args.to_mp4: + vid_path = vid_path.split('.') + assert len(vid_path) == 2, \ + f"Video path '{vid_path}' contain more than one dot" + vid_path = vid_path[0] + '.mp4' + out_full_path = osp.join(args.out_dir, vid_path) + dir_name = osp.dirname(vid_path) + out_dir = osp.join(args.out_dir, dir_name) + if not osp.exists(out_dir): + os.makedirs(out_dir) + result = os.popen( + f'ffprobe -hide_banner -loglevel error -select_streams v:0 -show_entries stream=width,height -of csv=p=0 {full_path}' # noqa:E501 + ) + w, h = [int(d) for d in result.readline().rstrip().split(',')] + if w > h: + cmd = (f'ffmpeg -hide_banner -loglevel error -i {full_path} ' + f'-vf {"mpdecimate," if args.remove_dup else ""}' + f'scale=-2:{args.scale} ' + f'{"-vsync vfr" if args.remove_dup else ""} ' + f'-c:v libx264 {"-g 16" if args.dense else ""} ' + f'-an {out_full_path} -y') + else: + cmd = (f'ffmpeg -hide_banner -loglevel error -i {full_path} ' + f'-vf {"mpdecimate," if args.remove_dup else ""}' + f'scale={args.scale}:-2 ' + f'{"-vsync vfr" if args.remove_dup 
else ""} ' + f'-c:v libx264 {"-g 16" if args.dense else ""} ' + f'-an {out_full_path} -y') + os.popen(cmd) + print(f'{vid_path} done') + sys.stdout.flush() + return True + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate the resized cache of original videos') + parser.add_argument('src_dir', type=str, help='source video directory') + parser.add_argument('out_dir', type=str, help='output video directory') + parser.add_argument( + '--dense', + action='store_true', + help='whether to generate a faster cache') + parser.add_argument( + '--level', + type=int, + choices=[1, 2], + default=2, + help='directory level of data') + parser.add_argument( + '--remove-dup', + action='store_true', + help='whether to remove duplicated frames') + parser.add_argument( + '--ext', + type=str, + default='mp4', + choices=['avi', 'mp4', 'webm', 'mkv'], + help='video file extensions') + parser.add_argument( + '--to-mp4', + action='store_true', + help='whether to output videos in mp4 format') + parser.add_argument( + '--scale', + type=int, + default=256, + help='resize image short side length keeping ratio') + parser.add_argument( + '--num-worker', type=int, default=8, help='number of workers') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + if not osp.isdir(args.out_dir): + print(f'Creating folder: {args.out_dir}') + os.makedirs(args.out_dir) + + print('Reading videos from folder: ', args.src_dir) + print('Extension of videos: ', args.ext) + fullpath_list = glob.glob(args.src_dir + '/*' * args.level + '.' 
+ + args.ext) + done_fullpath_list = glob.glob(args.out_dir + '/*' * args.level + args.ext) + print('Total number of videos found: ', len(fullpath_list)) + print('Total number of videos transfer finished: ', + len(done_fullpath_list)) + if args.level == 2: + vid_list = list( + map( + lambda p: osp.join( + osp.basename(osp.dirname(p)), osp.basename(p)), + fullpath_list)) + elif args.level == 1: + vid_list = list(map(osp.basename, fullpath_list)) + pool = Pool(args.num_worker) + pool.map(resize_videos, zip(fullpath_list, vid_list)) diff --git a/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt b/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt new file mode 100644 index 0000000000000000000000000000000000000000..d321ffdfe593581136c912d7c6eef195f73e7046 --- /dev/null +++ b/tools/data/skeleton/NTU_RGBD120_samples_with_missing_skeletons.txt @@ -0,0 +1,535 @@ +S001C002P005R002A008 +S001C002P006R001A008 +S001C003P002R001A055 +S001C003P002R002A012 +S001C003P005R002A004 +S001C003P005R002A005 +S001C003P005R002A006 +S001C003P006R002A008 +S002C002P011R002A030 +S002C003P008R001A020 +S002C003P010R002A010 +S002C003P011R002A007 +S002C003P011R002A011 +S002C003P014R002A007 +S003C001P019R001A055 +S003C002P002R002A055 +S003C002P018R002A055 +S003C003P002R001A055 +S003C003P016R001A055 +S003C003P018R002A024 +S004C002P003R001A013 +S004C002P008R001A009 +S004C002P020R001A003 +S004C002P020R001A004 +S004C002P020R001A012 +S004C002P020R001A020 +S004C002P020R001A021 +S004C002P020R001A036 +S005C002P004R001A001 +S005C002P004R001A003 +S005C002P010R001A016 +S005C002P010R001A017 +S005C002P010R001A048 +S005C002P010R001A049 +S005C002P016R001A009 +S005C002P016R001A010 +S005C002P018R001A003 +S005C002P018R001A028 +S005C002P018R001A029 +S005C003P016R002A009 +S005C003P018R002A013 +S005C003P021R002A057 +S006C001P001R002A055 +S006C002P007R001A005 +S006C002P007R001A006 +S006C002P016R001A043 +S006C002P016R001A051 +S006C002P016R001A052 +S006C002P022R001A012 +S006C002P023R001A020 
+S006C002P023R001A021 +S006C002P023R001A022 +S006C002P023R001A023 +S006C002P024R001A018 +S006C002P024R001A019 +S006C003P001R002A013 +S006C003P007R002A009 +S006C003P007R002A010 +S006C003P007R002A025 +S006C003P016R001A060 +S006C003P017R001A055 +S006C003P017R002A013 +S006C003P017R002A014 +S006C003P017R002A015 +S006C003P022R002A013 +S007C001P018R002A050 +S007C001P025R002A051 +S007C001P028R001A050 +S007C001P028R001A051 +S007C001P028R001A052 +S007C002P008R002A008 +S007C002P015R002A055 +S007C002P026R001A008 +S007C002P026R001A009 +S007C002P026R001A010 +S007C002P026R001A011 +S007C002P026R001A012 +S007C002P026R001A050 +S007C002P027R001A011 +S007C002P027R001A013 +S007C002P028R002A055 +S007C003P007R001A002 +S007C003P007R001A004 +S007C003P019R001A060 +S007C003P027R002A001 +S007C003P027R002A002 +S007C003P027R002A003 +S007C003P027R002A004 +S007C003P027R002A005 +S007C003P027R002A006 +S007C003P027R002A007 +S007C003P027R002A008 +S007C003P027R002A009 +S007C003P027R002A010 +S007C003P027R002A011 +S007C003P027R002A012 +S007C003P027R002A013 +S008C002P001R001A009 +S008C002P001R001A010 +S008C002P001R001A014 +S008C002P001R001A015 +S008C002P001R001A016 +S008C002P001R001A018 +S008C002P001R001A019 +S008C002P008R002A059 +S008C002P025R001A060 +S008C002P029R001A004 +S008C002P031R001A005 +S008C002P031R001A006 +S008C002P032R001A018 +S008C002P034R001A018 +S008C002P034R001A019 +S008C002P035R001A059 +S008C002P035R002A002 +S008C002P035R002A005 +S008C003P007R001A009 +S008C003P007R001A016 +S008C003P007R001A017 +S008C003P007R001A018 +S008C003P007R001A019 +S008C003P007R001A020 +S008C003P007R001A021 +S008C003P007R001A022 +S008C003P007R001A023 +S008C003P007R001A025 +S008C003P007R001A026 +S008C003P007R001A028 +S008C003P007R001A029 +S008C003P007R002A003 +S008C003P008R002A050 +S008C003P025R002A002 +S008C003P025R002A011 +S008C003P025R002A012 +S008C003P025R002A016 +S008C003P025R002A020 +S008C003P025R002A022 +S008C003P025R002A023 +S008C003P025R002A030 +S008C003P025R002A031 +S008C003P025R002A032 
+S008C003P025R002A033 +S008C003P025R002A049 +S008C003P025R002A060 +S008C003P031R001A001 +S008C003P031R002A004 +S008C003P031R002A014 +S008C003P031R002A015 +S008C003P031R002A016 +S008C003P031R002A017 +S008C003P032R002A013 +S008C003P033R002A001 +S008C003P033R002A011 +S008C003P033R002A012 +S008C003P034R002A001 +S008C003P034R002A012 +S008C003P034R002A022 +S008C003P034R002A023 +S008C003P034R002A024 +S008C003P034R002A044 +S008C003P034R002A045 +S008C003P035R002A016 +S008C003P035R002A017 +S008C003P035R002A018 +S008C003P035R002A019 +S008C003P035R002A020 +S008C003P035R002A021 +S009C002P007R001A001 +S009C002P007R001A003 +S009C002P007R001A014 +S009C002P008R001A014 +S009C002P015R002A050 +S009C002P016R001A002 +S009C002P017R001A028 +S009C002P017R001A029 +S009C003P017R002A030 +S009C003P025R002A054 +S010C001P007R002A020 +S010C002P016R002A055 +S010C002P017R001A005 +S010C002P017R001A018 +S010C002P017R001A019 +S010C002P019R001A001 +S010C002P025R001A012 +S010C003P007R002A043 +S010C003P008R002A003 +S010C003P016R001A055 +S010C003P017R002A055 +S011C001P002R001A008 +S011C001P018R002A050 +S011C002P008R002A059 +S011C002P016R002A055 +S011C002P017R001A020 +S011C002P017R001A021 +S011C002P018R002A055 +S011C002P027R001A009 +S011C002P027R001A010 +S011C002P027R001A037 +S011C003P001R001A055 +S011C003P002R001A055 +S011C003P008R002A012 +S011C003P015R001A055 +S011C003P016R001A055 +S011C003P019R001A055 +S011C003P025R001A055 +S011C003P028R002A055 +S012C001P019R001A060 +S012C001P019R002A060 +S012C002P015R001A055 +S012C002P017R002A012 +S012C002P025R001A060 +S012C003P008R001A057 +S012C003P015R001A055 +S012C003P015R002A055 +S012C003P016R001A055 +S012C003P017R002A055 +S012C003P018R001A055 +S012C003P018R001A057 +S012C003P019R002A011 +S012C003P019R002A012 +S012C003P025R001A055 +S012C003P027R001A055 +S012C003P027R002A009 +S012C003P028R001A035 +S012C003P028R002A055 +S013C001P015R001A054 +S013C001P017R002A054 +S013C001P018R001A016 +S013C001P028R001A040 +S013C002P015R001A054 +S013C002P017R002A054 
+S013C002P028R001A040 +S013C003P008R002A059 +S013C003P015R001A054 +S013C003P017R002A054 +S013C003P025R002A022 +S013C003P027R001A055 +S013C003P028R001A040 +S014C001P027R002A040 +S014C002P015R001A003 +S014C002P019R001A029 +S014C002P025R002A059 +S014C002P027R002A040 +S014C002P039R001A050 +S014C003P007R002A059 +S014C003P015R002A055 +S014C003P019R002A055 +S014C003P025R001A048 +S014C003P027R002A040 +S015C001P008R002A040 +S015C001P016R001A055 +S015C001P017R001A055 +S015C001P017R002A055 +S015C002P007R001A059 +S015C002P008R001A003 +S015C002P008R001A004 +S015C002P008R002A040 +S015C002P015R001A002 +S015C002P016R001A001 +S015C002P016R002A055 +S015C003P008R002A007 +S015C003P008R002A011 +S015C003P008R002A012 +S015C003P008R002A028 +S015C003P008R002A040 +S015C003P025R002A012 +S015C003P025R002A017 +S015C003P025R002A020 +S015C003P025R002A021 +S015C003P025R002A030 +S015C003P025R002A033 +S015C003P025R002A034 +S015C003P025R002A036 +S015C003P025R002A037 +S015C003P025R002A044 +S016C001P019R002A040 +S016C001P025R001A011 +S016C001P025R001A012 +S016C001P025R001A060 +S016C001P040R001A055 +S016C001P040R002A055 +S016C002P008R001A011 +S016C002P019R002A040 +S016C002P025R002A012 +S016C003P008R001A011 +S016C003P008R002A002 +S016C003P008R002A003 +S016C003P008R002A004 +S016C003P008R002A006 +S016C003P008R002A009 +S016C003P019R002A040 +S016C003P039R002A016 +S017C001P016R002A031 +S017C002P007R001A013 +S017C002P008R001A009 +S017C002P015R001A042 +S017C002P016R002A031 +S017C002P016R002A055 +S017C003P007R002A013 +S017C003P008R001A059 +S017C003P016R002A031 +S017C003P017R001A055 +S017C003P020R001A059 +S019C001P046R001A075 +S019C002P042R001A094 +S019C002P042R001A095 +S019C002P042R001A096 +S019C002P042R001A097 +S019C002P042R001A098 +S019C002P042R001A099 +S019C002P042R001A100 +S019C002P042R001A101 +S019C002P042R001A102 +S019C002P049R002A074 +S019C002P049R002A079 +S019C002P051R001A061 +S019C003P046R001A061 +S019C003P046R002A061 +S019C003P046R002A062 +S020C002P041R001A063 +S020C002P041R001A064 
+S020C002P044R001A063 +S020C002P044R001A064 +S020C002P044R001A066 +S020C002P044R001A084 +S020C002P054R001A081 +S021C001P059R001A108 +S021C002P055R001A065 +S021C002P055R001A092 +S021C002P055R001A093 +S021C002P057R001A064 +S021C002P058R001A063 +S021C002P058R001A064 +S021C002P059R001A074 +S021C002P059R001A075 +S021C002P059R001A076 +S021C002P059R001A077 +S021C002P059R001A078 +S021C002P059R001A079 +S021C003P057R002A078 +S021C003P057R002A079 +S021C003P057R002A094 +S022C002P061R001A113 +S022C003P061R002A061 +S022C003P061R002A062 +S022C003P063R002A061 +S022C003P063R002A062 +S022C003P063R002A063 +S022C003P063R002A064 +S022C003P063R002A078 +S022C003P064R002A061 +S022C003P064R002A062 +S022C003P065R002A061 +S022C003P065R002A062 +S022C003P065R002A119 +S022C003P067R002A064 +S023C002P055R001A114 +S023C002P055R002A092 +S023C002P059R001A075 +S023C002P063R001A075 +S023C003P055R002A093 +S023C003P055R002A094 +S023C003P061R002A061 +S023C003P064R001A092 +S024C001P063R001A109 +S024C002P062R002A074 +S024C002P067R001A100 +S024C002P067R001A101 +S024C002P067R001A102 +S024C002P067R001A103 +S024C003P062R002A074 +S024C003P063R002A061 +S024C003P063R002A062 +S025C001P055R002A119 +S025C003P056R002A119 +S025C003P059R002A115 +S026C002P044R001A061 +S026C002P044R001A062 +S026C002P070R001A092 +S026C003P069R002A075 +S026C003P074R002A061 +S026C003P074R002A062 +S026C003P075R001A117 +S026C003P075R001A118 +S027C001P082R001A063 +S027C002P044R002A092 +S027C002P079R001A061 +S027C002P079R001A062 +S027C002P079R001A063 +S027C002P079R001A064 +S027C002P082R001A092 +S027C002P084R001A061 +S027C002P084R001A062 +S027C002P086R001A061 +S027C003P041R002A087 +S027C003P080R002A061 +S027C003P082R002A061 +S027C003P082R002A062 +S027C003P086R002A061 +S027C003P086R002A062 +S028C001P087R001A061 +S028C002P041R001A091 +S028C002P087R001A061 +S028C003P042R002A064 +S028C003P046R002A063 +S028C003P046R002A066 +S028C003P046R002A067 +S028C003P046R002A068 +S028C003P046R002A069 +S028C003P046R002A070 +S028C003P046R002A071 
+S028C003P046R002A072 +S028C003P046R002A074 +S028C003P046R002A075 +S028C003P046R002A077 +S028C003P046R002A081 +S028C003P046R002A082 +S028C003P046R002A083 +S028C003P046R002A084 +S028C003P048R002A061 +S028C003P048R002A062 +S028C003P048R002A073 +S028C003P073R002A073 +S028C003P087R001A061 +S028C003P087R002A061 +S028C003P087R002A062 +S029C001P043R002A092 +S029C001P044R002A092 +S029C001P048R001A073 +S029C001P089R001A063 +S029C002P041R001A074 +S029C002P041R001A084 +S029C002P044R001A091 +S029C002P048R001A075 +S029C002P048R001A081 +S029C002P074R001A081 +S029C002P074R001A095 +S029C002P074R001A096 +S029C002P080R001A091 +S029C002P088R001A066 +S029C002P089R001A065 +S029C002P090R001A067 +S029C003P008R002A065 +S029C003P008R002A067 +S029C003P041R001A089 +S029C003P043R001A080 +S029C003P043R001A092 +S029C003P043R001A105 +S029C003P043R002A085 +S029C003P043R002A086 +S029C003P044R002A106 +S029C003P048R001A065 +S029C003P048R002A073 +S029C003P048R002A074 +S029C003P048R002A075 +S029C003P048R002A076 +S029C003P048R002A092 +S029C003P048R002A094 +S029C003P051R002A073 +S029C003P051R002A074 +S029C003P051R002A075 +S029C003P051R002A076 +S029C003P051R002A077 +S029C003P051R002A078 +S029C003P051R002A079 +S029C003P051R002A080 +S029C003P051R002A081 +S029C003P051R002A082 +S029C003P051R002A083 +S029C003P051R002A084 +S029C003P051R002A085 +S029C003P051R002A086 +S029C003P051R002A110 +S029C003P067R001A098 +S029C003P074R002A110 +S029C003P080R002A066 +S029C003P088R002A078 +S029C003P089R001A075 +S029C003P089R002A061 +S029C003P089R002A062 +S029C003P089R002A063 +S029C003P090R002A092 +S029C003P090R002A095 +S030C002P091R002A091 +S030C002P091R002A092 +S030C002P091R002A093 +S030C002P091R002A094 +S030C002P091R002A095 +S030C002P091R002A096 +S030C002P091R002A097 +S030C002P091R002A098 +S030C002P091R002A099 +S030C002P091R002A100 +S030C002P091R002A101 +S030C002P091R002A102 +S030C002P091R002A103 +S030C002P091R002A104 +S030C002P091R002A105 +S030C003P044R002A065 +S030C003P044R002A081 +S030C003P044R002A084 
+S031C002P042R001A111 +S031C002P051R001A061 +S031C002P051R001A062 +S031C002P067R001A067 +S031C002P067R001A068 +S031C002P067R001A069 +S031C002P067R001A070 +S031C002P067R001A071 +S031C002P067R001A072 +S031C002P082R001A075 +S031C002P082R002A117 +S031C002P097R001A061 +S031C002P097R001A062 +S031C003P043R002A074 +S031C003P043R002A075 +S031C003P044R002A094 +S031C003P082R002A067 +S031C003P082R002A068 +S031C003P082R002A069 +S031C003P082R002A070 +S031C003P082R002A071 +S031C003P082R002A072 +S031C003P082R002A073 +S031C003P082R002A075 +S031C003P082R002A076 +S031C003P082R002A077 +S031C003P082R002A084 +S031C003P082R002A085 +S031C003P082R002A086 +S032C002P067R001A092 +S032C003P067R002A066 +S032C003P067R002A067 +S032C003P067R002A075 +S032C003P067R002A076 +S032C003P067R002A077 diff --git a/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt b/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt new file mode 100644 index 0000000000000000000000000000000000000000..375050d46a665ffcb2c165e1d84c6fb2405674f2 --- /dev/null +++ b/tools/data/skeleton/NTU_RGBD_samples_with_missing_skeletons.txt @@ -0,0 +1,302 @@ +S001C002P005R002A008 +S001C002P006R001A008 +S001C003P002R001A055 +S001C003P002R002A012 +S001C003P005R002A004 +S001C003P005R002A005 +S001C003P005R002A006 +S001C003P006R002A008 +S002C002P011R002A030 +S002C003P008R001A020 +S002C003P010R002A010 +S002C003P011R002A007 +S002C003P011R002A011 +S002C003P014R002A007 +S003C001P019R001A055 +S003C002P002R002A055 +S003C002P018R002A055 +S003C003P002R001A055 +S003C003P016R001A055 +S003C003P018R002A024 +S004C002P003R001A013 +S004C002P008R001A009 +S004C002P020R001A003 +S004C002P020R001A004 +S004C002P020R001A012 +S004C002P020R001A020 +S004C002P020R001A021 +S004C002P020R001A036 +S005C002P004R001A001 +S005C002P004R001A003 +S005C002P010R001A016 +S005C002P010R001A017 +S005C002P010R001A048 +S005C002P010R001A049 +S005C002P016R001A009 +S005C002P016R001A010 +S005C002P018R001A003 +S005C002P018R001A028 +S005C002P018R001A029 
+S005C003P016R002A009 +S005C003P018R002A013 +S005C003P021R002A057 +S006C001P001R002A055 +S006C002P007R001A005 +S006C002P007R001A006 +S006C002P016R001A043 +S006C002P016R001A051 +S006C002P016R001A052 +S006C002P022R001A012 +S006C002P023R001A020 +S006C002P023R001A021 +S006C002P023R001A022 +S006C002P023R001A023 +S006C002P024R001A018 +S006C002P024R001A019 +S006C003P001R002A013 +S006C003P007R002A009 +S006C003P007R002A010 +S006C003P007R002A025 +S006C003P016R001A060 +S006C003P017R001A055 +S006C003P017R002A013 +S006C003P017R002A014 +S006C003P017R002A015 +S006C003P022R002A013 +S007C001P018R002A050 +S007C001P025R002A051 +S007C001P028R001A050 +S007C001P028R001A051 +S007C001P028R001A052 +S007C002P008R002A008 +S007C002P015R002A055 +S007C002P026R001A008 +S007C002P026R001A009 +S007C002P026R001A010 +S007C002P026R001A011 +S007C002P026R001A012 +S007C002P026R001A050 +S007C002P027R001A011 +S007C002P027R001A013 +S007C002P028R002A055 +S007C003P007R001A002 +S007C003P007R001A004 +S007C003P019R001A060 +S007C003P027R002A001 +S007C003P027R002A002 +S007C003P027R002A003 +S007C003P027R002A004 +S007C003P027R002A005 +S007C003P027R002A006 +S007C003P027R002A007 +S007C003P027R002A008 +S007C003P027R002A009 +S007C003P027R002A010 +S007C003P027R002A011 +S007C003P027R002A012 +S007C003P027R002A013 +S008C002P001R001A009 +S008C002P001R001A010 +S008C002P001R001A014 +S008C002P001R001A015 +S008C002P001R001A016 +S008C002P001R001A018 +S008C002P001R001A019 +S008C002P008R002A059 +S008C002P025R001A060 +S008C002P029R001A004 +S008C002P031R001A005 +S008C002P031R001A006 +S008C002P032R001A018 +S008C002P034R001A018 +S008C002P034R001A019 +S008C002P035R001A059 +S008C002P035R002A002 +S008C002P035R002A005 +S008C003P007R001A009 +S008C003P007R001A016 +S008C003P007R001A017 +S008C003P007R001A018 +S008C003P007R001A019 +S008C003P007R001A020 +S008C003P007R001A021 +S008C003P007R001A022 +S008C003P007R001A023 +S008C003P007R001A025 +S008C003P007R001A026 +S008C003P007R001A028 +S008C003P007R001A029 +S008C003P007R002A003 
+S008C003P008R002A050 +S008C003P025R002A002 +S008C003P025R002A011 +S008C003P025R002A012 +S008C003P025R002A016 +S008C003P025R002A020 +S008C003P025R002A022 +S008C003P025R002A023 +S008C003P025R002A030 +S008C003P025R002A031 +S008C003P025R002A032 +S008C003P025R002A033 +S008C003P025R002A049 +S008C003P025R002A060 +S008C003P031R001A001 +S008C003P031R002A004 +S008C003P031R002A014 +S008C003P031R002A015 +S008C003P031R002A016 +S008C003P031R002A017 +S008C003P032R002A013 +S008C003P033R002A001 +S008C003P033R002A011 +S008C003P033R002A012 +S008C003P034R002A001 +S008C003P034R002A012 +S008C003P034R002A022 +S008C003P034R002A023 +S008C003P034R002A024 +S008C003P034R002A044 +S008C003P034R002A045 +S008C003P035R002A016 +S008C003P035R002A017 +S008C003P035R002A018 +S008C003P035R002A019 +S008C003P035R002A020 +S008C003P035R002A021 +S009C002P007R001A001 +S009C002P007R001A003 +S009C002P007R001A014 +S009C002P008R001A014 +S009C002P015R002A050 +S009C002P016R001A002 +S009C002P017R001A028 +S009C002P017R001A029 +S009C003P017R002A030 +S009C003P025R002A054 +S010C001P007R002A020 +S010C002P016R002A055 +S010C002P017R001A005 +S010C002P017R001A018 +S010C002P017R001A019 +S010C002P019R001A001 +S010C002P025R001A012 +S010C003P007R002A043 +S010C003P008R002A003 +S010C003P016R001A055 +S010C003P017R002A055 +S011C001P002R001A008 +S011C001P018R002A050 +S011C002P008R002A059 +S011C002P016R002A055 +S011C002P017R001A020 +S011C002P017R001A021 +S011C002P018R002A055 +S011C002P027R001A009 +S011C002P027R001A010 +S011C002P027R001A037 +S011C003P001R001A055 +S011C003P002R001A055 +S011C003P008R002A012 +S011C003P015R001A055 +S011C003P016R001A055 +S011C003P019R001A055 +S011C003P025R001A055 +S011C003P028R002A055 +S012C001P019R001A060 +S012C001P019R002A060 +S012C002P015R001A055 +S012C002P017R002A012 +S012C002P025R001A060 +S012C003P008R001A057 +S012C003P015R001A055 +S012C003P015R002A055 +S012C003P016R001A055 +S012C003P017R002A055 +S012C003P018R001A055 +S012C003P018R001A057 +S012C003P019R002A011 +S012C003P019R002A012 
+S012C003P025R001A055 +S012C003P027R001A055 +S012C003P027R002A009 +S012C003P028R001A035 +S012C003P028R002A055 +S013C001P015R001A054 +S013C001P017R002A054 +S013C001P018R001A016 +S013C001P028R001A040 +S013C002P015R001A054 +S013C002P017R002A054 +S013C002P028R001A040 +S013C003P008R002A059 +S013C003P015R001A054 +S013C003P017R002A054 +S013C003P025R002A022 +S013C003P027R001A055 +S013C003P028R001A040 +S014C001P027R002A040 +S014C002P015R001A003 +S014C002P019R001A029 +S014C002P025R002A059 +S014C002P027R002A040 +S014C002P039R001A050 +S014C003P007R002A059 +S014C003P015R002A055 +S014C003P019R002A055 +S014C003P025R001A048 +S014C003P027R002A040 +S015C001P008R002A040 +S015C001P016R001A055 +S015C001P017R001A055 +S015C001P017R002A055 +S015C002P007R001A059 +S015C002P008R001A003 +S015C002P008R001A004 +S015C002P008R002A040 +S015C002P015R001A002 +S015C002P016R001A001 +S015C002P016R002A055 +S015C003P008R002A007 +S015C003P008R002A011 +S015C003P008R002A012 +S015C003P008R002A028 +S015C003P008R002A040 +S015C003P025R002A012 +S015C003P025R002A017 +S015C003P025R002A020 +S015C003P025R002A021 +S015C003P025R002A030 +S015C003P025R002A033 +S015C003P025R002A034 +S015C003P025R002A036 +S015C003P025R002A037 +S015C003P025R002A044 +S016C001P019R002A040 +S016C001P025R001A011 +S016C001P025R001A012 +S016C001P025R001A060 +S016C001P040R001A055 +S016C001P040R002A055 +S016C002P008R001A011 +S016C002P019R002A040 +S016C002P025R002A012 +S016C003P008R001A011 +S016C003P008R002A002 +S016C003P008R002A003 +S016C003P008R002A004 +S016C003P008R002A006 +S016C003P008R002A009 +S016C003P019R002A040 +S016C003P039R002A016 +S017C001P016R002A031 +S017C002P007R001A013 +S017C002P008R001A009 +S017C002P015R001A042 +S017C002P016R002A031 +S017C002P016R002A055 +S017C003P007R002A013 +S017C003P008R001A059 +S017C003P016R002A031 +S017C003P017R001A055 +S017C003P020R001A059 diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..00c1814d16c37cff82526197eedca1855583d8c9 --- /dev/null +++ b/tools/data/skeleton/README.md @@ -0,0 +1,129 @@ +# Preparing Skeleton Dataset + + + +```BibTeX +@misc{duan2021revisiting, + title={Revisiting Skeleton-based Action Recognition}, + author={Haodong Duan and Yue Zhao and Kai Chen and Dian Shao and Dahua Lin and Bo Dai}, + year={2021}, + eprint={2104.13586}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Introduction + +We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. + +## Prepare Annotations + +We provide links to the pre-processed skeleton annotations, you can directly download them and use them for training & testing. + +- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with [link](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl). 
Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project. +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations) + +For Kinetics400, since the skeleton annotations are large, we do not provide the direct download links on aliyun. Please use the following link to download the `k400_kpfiles_2d.zip` and extract it under `$MMACTION2/data/skeleton/kpfiles` for Kinetics400 training & testing: https://openxlab.org.cn/datasets/OpenMMLab/Kinetics400-skeleton + +If you want to generate 2D skeleton annotations of specified video, please install mmdetection and mmpose first, then use the following script to extract skeleton annotations of NTURGB+D video: + +```python +python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl +``` + +please note that, due to the upgrade of mmpose, the inference results may have slight differences from the provided skeleton annotations. + +## The Format of Annotations + +Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` + +1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. +2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + 1. `frame_dir` (str): The identifier of the corresponding video. + 2. `total_frames` (int): The number of frames in this video. + 3. 
`img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of (height, width). Only required for 2D skeletons. + 4. `original_shape` (tuple\[int\]): Same as `img_shape`. + 5. `label` (int): The action label. + 6. `keypoint` (np.ndarray, with shape \[M x T x V x C\]): The keypoint annotation. M: number of persons; T: number of frames (same as `total_frames`); V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + 7. `keypoint_score` (np.ndarray, with shape \[M x T x V\]): The confidence score of keypoints. Only required for 2D skeletons. + +## Visualization + +For skeleton data visualization, you need also to prepare the RGB videos. Please refer to \[visualize_heatmap_volume\] for detailed process. Here we provide some visualization examples from NTU-60 and FineGYM. + + + + + + + + + +
+
+ Pose Estimation Results +
+ +
+
+ +
+
+ Keypoint Heatmap Volume Visualization +
+ +
+
+ +
+
+ Limb Heatmap Volume Visualization +
+ +
+
+ +
+ +## Convert the NTU RGB+D raw skeleton data to our format (only applicable to GCN backbones) + +Here we also provide the script for converting the NTU RGB+D raw skeleton data to our format. +First, download the raw skeleton data of NTU-RGBD 60 and NTU-RGBD 120 from https://github.com/shahroudy/NTURGB-D. + +For NTU-RGBD 60, preprocess data and convert the data format with + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd60_skeleton_path --ignored-sample-path NTU_RGBD_samples_with_missing_skeletons.txt --out-folder your_nturgbd60_output_path --task ntu60 +``` + +For NTU-RGBD 120, preprocess data and convert the data format with + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd120_skeleton_path --ignored-sample-path NTU_RGBD120_samples_with_missing_skeletons.txt --out-folder your_nturgbd120_output_path --task ntu120 +``` + +## Convert annotations from third-party projects + +We provide scripts to convert skeleton annotations from third-party projects to MMAction2 formats: + +- BABEL: `babel2mma2.py` + +**TODO**: + +- [x] FineGYM +- [x] NTU60_XSub +- [x] NTU120_XSub +- [x] NTU60_XView +- [x] NTU120_XSet +- [x] UCF101 +- [x] HMDB51 +- [x] Kinetics diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..19973f73240d81833a3e368400160266f107e50b --- /dev/null +++ b/tools/data/skeleton/README_zh-CN.md @@ -0,0 +1,142 @@ +# 准备骨架数据集 + +```BibTeX +@misc{duan2021revisiting, + title={Revisiting Skeleton-based Action Recognition}, + author={Haodong Duan and Yue Zhao and Kai Chen and Dian Shao and Dahua Lin and Bo Dai}, + year={2021}, + eprint={2104.13586}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## 简介 + +MMAction2 发布 [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586) 论文中所使用的骨架标注。 +默认使用 
[Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) 作为人体检测器, +使用 [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/top_down/hrnet/coco/hrnet_w32_coco_256x192.py) 作为单人姿态估计模型。 +对于 FineGYM 数据集,MMAction2 使用的是运动员的真实框标注,而非检测器所出的框。目前,MMAction2 已发布 FineGYM 和 NTURGB-D Xsub 部分的骨架标注,其他数据集的标注也将很快发布。 + +## 准备标注文件 + +目前,MMAction2 支持 HMDB51, UCF101, FineGYM 和 NTURGB+D 数据集。对于 FineGYM 数据集,用户可以使用以下脚本下载标注文件。 + +```shell +bash download_annotations.sh ${DATASET} +``` + +由于 NTURGB+D 数据集的 [使用条例](http://rose1.ntu.edu.sg/Datasets/actionRecognition.asp),MMAction2 并未直接发布实验中所使用的标注文件。 +因此,这里提供生成 NTURGB+D 数据集中视频的姿态标注文件,这将生成一个 dict 数据并将其保存为一个 pickle 文件。 +用户可以生成一个 list 用以包含对应视频的 dict 数据,并将其保存为一个 pickle 文件。 +之后,用户可以获得 `ntu60_xsub_train.pkl`, `ntu60_xsub_val.pkl`, `ntu120_xsub_train.pkl`, `ntu120_xsub_val.pkl` 文件用于训练。 + +对于无法进行姿态提取的用户,这里提供了上述流程的输出结果,分别对应 NTURGB-D 数据集的 4 个部分: + +- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D 姿态标注文件是基于运动员的真实标注框生成的,用户可以从这个[链接](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl)下载真实标注框。如果你在项目中使用了该数据,请引用 [PoseConv3D](https://arxiv.org/abs/2104.13586) +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: 
https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (只包含数据列表,没有姿态标注文件) + +由于 Kinetics400 数据集姿态标注文件过大,我们不提供阿里云的下载链接,请使用此[链接](https://openxlab.org.cn/datasets/OpenMMLab/Kinetics400-skeleton)下载 `k400_kpfiles_2d.zip`,解压到 `$MMACTION2/data/skeleton/kpfiles` 目录下,用于 Kinetics400 的训练和测试。 + +若想生成单个视频的 2D 姿态标注文件,用户在安装 mmdetection 和 mmpose 之后,可使用以下脚本进行 NTURGB+D 视频的姿态提取: + +```python +python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl +``` + +请注意,由于 mmpose 算法库升级,此脚本的推理结果与提供的姿态点数据集可能略有差异。 + +在用户获得数据集某部分所有视频的姿态标注文件(如 `ntu60_xsub_val`)后,可以将其集合成一个 list 数据并保存为 `ntu60_xsub_val.pkl`。用户可用这些大型 pickle 文件进行训练和测试。 + +## PoseC3D 的标注文件格式 + +这里简单介绍 PoseC3D 的标注文件格式。以 `gym_train.pkl` 为例:`gym_train.pkl` 存储一个长度为 20484 的 list,list 的每一项为单个视频的骨架标注 dict。每个 dict 的内容如下: + +- keypoint:关键点坐标,大小为 N(#人数)x T(时序长度)x K(#关键点, 这里为17)x 2 (x,y 坐标)的 numpy array 数据类型 +- keypoint_score:关键点的置信分数,大小为 N(#人数)x T(时序长度)x K(#关键点, 这里为17)的 numpy array 数据类型 +- frame_dir: 对应视频名 +- label: 动作类别 +- img_shape: 每一帧图像的大小 +- original_shape: 同 `img_shape` +- total_frames: 视频时序长度 + +如用户想使用自己的数据集训练 PoseC3D,可以参考 [Custom Dataset Training](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/custom_dataset_training.md)。 + +## 可视化 + +为了可视化骨架数据,用户需要准备 RGB 的视频。详情可参考 \[visualize_heatmap_volume\]。这里提供一些 NTU-60 和 FineGYM 上的例子 + + + + + + + + + +
+
+ 姿态估计结果 +
+ +
+
+ +
+
+ 关键点热力图三维可视化 +
+ +
+
+ +
+
+ 肢体热力图三维可视化 +
+ +
+
+ +
+ +## 如何将 NTU RGB+D 原始数据转化为 MMAction2 格式 (转换好的标注文件目前仅适用于 GCN 模型) + +这里介绍如何将 NTU RGB+D 原始数据转化为 MMAction2 格式。首先,需要从 https://github.com/shahroudy/NTURGB-D 下载原始 NTU-RGBD 60 和 NTU-RGBD 120 数据集的原始骨架数据。 + +对于 NTU-RGBD 60 数据集,可使用以下脚本 + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd60_skeleton_path --ignored-sample-path NTU_RGBD_samples_with_missing_skeletons.txt --out-folder your_nturgbd60_output_path --task ntu60 +``` + +对于 NTU-RGBD 120 数据集,可使用以下脚本 + +```python +python gen_ntu_rgbd_raw.py --data-path your_raw_nturgbd120_skeleton_path --ignored-sample-path NTU_RGBD120_samples_with_missing_skeletons.txt --out-folder your_nturgbd120_output_path --task ntu120 +``` + +## 转换其他第三方项目的骨骼标注 + +MMAction2 提供脚本以将其他第三方项目的骨骼标注转至 MMAction2 格式,如: + +- BABEL: `babel2mma2.py` + +**待办项**: + +- [x] FineGYM +- [x] NTU60_XSub +- [x] NTU120_XSub +- [x] NTU60_XView +- [x] NTU120_XSet +- [x] UCF101 +- [x] HMDB51 +- [x] Kinetics diff --git a/tools/data/skeleton/S001C001P001R001A001_rgb.avi b/tools/data/skeleton/S001C001P001R001A001_rgb.avi new file mode 100644 index 0000000000000000000000000000000000000000..62b6258288262fc35a36ea5f344fda03f7d5d044 --- /dev/null +++ b/tools/data/skeleton/S001C001P001R001A001_rgb.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cea2b398b79fc619d545a042cd0bce8ace9aa4d482b8c1e30482311f204c4a +size 987146 diff --git a/tools/data/skeleton/babel2mma2.py b/tools/data/skeleton/babel2mma2.py new file mode 100644 index 0000000000000000000000000000000000000000..67bcf6e30a3e45d2985dd901c92ea0a059d598e3 --- /dev/null +++ b/tools/data/skeleton/babel2mma2.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
# In this example, we convert babel120_train to MMAction2 format.
# The required files can be downloaded from the homepage of the BABEL project.
import numpy as np


def gen_babel(x, y):
    """Convert BABEL skeleton samples to MMAction2 annotation dicts.

    Args:
        x (np.ndarray): Per-sample skeleton arrays; each entry is 4-D and is
            transposed with ``(3, 1, 2, 0)`` below.
            # assumes the BABEL (C, T, V, M) per-sample layout — TODO confirm
            # against the released ``train_ntu_sk_120.npy``.
        y: BABEL label structure: ``y[0]`` holds sample names and ``y[1]``
            holds four parallel per-sample sequences, the first of which is
            the integer label.

    Returns:
        list[dict]: One dict per sample with keys ``keypoint`` (float16),
        ``label``, ``frame_dir`` and ``total_frames``.
    """
    annotations = []
    for idx, skeleton in enumerate(x):
        # axes (3, 1, 2, 0): move the last axis first and the first axis
        # last, keeping the two middle axes in place.
        keypoint = skeleton.transpose(3, 1, 2, 0).astype(np.float16)
        name_parts = (y[0][idx], y[1][1][idx], y[1][2][idx], y[1][3][idx])
        annotations.append(
            dict(
                keypoint=keypoint,
                label=y[1][0][idx],
                frame_dir='_'.join(str(part) for part in name_parts),
                # NOTE(review): fixed length of 150 frames mirrors the
                # original script — verify against the BABEL release.
                total_frames=150))
    return annotations


if __name__ == '__main__':
    # Deferred import: keeps `gen_babel` importable without mmcv, and the
    # file I/O below only runs when executed as a script (the original ran
    # it at import time, unguarded).
    from mmcv import dump, load

    x = np.load('train_ntu_sk_120.npy')
    y = load('train_label_120.pkl')

    data = gen_babel(x, y)
    dump(data, 'babel120_train.pkl')

# --- next file (kept from the original paste): tools/data/skeleton/compress_nturgbd.py ---
# Copyright (c) OpenMMLab. All rights reserved.
+import multiprocessing as mp +import os +import os.path as osp +import subprocess + + +def get_shape(vid): + cmd = 'ffprobe -v error -select_streams v:0 -show_entries ' \ + 'stream=width,height -of csv=s=x:p=0 \"{}\"'.format(vid) + w, h = subprocess.check_output(cmd, shell=True).decode('utf-8').split('x') + return int(w), int(h) + + +def compress(src, dest, shape=None, target_size=540, fps=-1): + if shape is None: + shape = get_shape(src) + w, h = shape + scale_str = f'-vf scale=-2:{target_size}' if w >= h else \ + f'-vf scale={target_size}:-2' + fps_str = f'-r {fps}' if fps > 0 else '' + quality_str = '-q:v 1' + vcodec_str = '-c:v libx264' + cmd = f'ffmpeg -y -loglevel error -i {src} -threads 1 ' \ + f'{quality_str} {scale_str} {fps_str} {vcodec_str} {dest}' + os.system(cmd) + + +def compress_nturgbd(name): + src = name + dest = src.replace('nturgbd_raw', + 'nturgbd_videos').replace('_rgb.avi', '.mp4') + shape = (1920, 1080) + compress(src, dest, shape) + + +src_dir = 'data/nturgbd_raw' +tgt_dir = 'data/nturgbd_videos' +os.makedirs(tgt_dir, exist_ok=True) +files = [osp.join(src_dir, x) for x in os.listdir(src_dir) if '.avi' in x] +pool = mp.Pool(32) +pool.map(compress_nturgbd, files) diff --git a/tools/data/skeleton/gen_ntu_rgbd_raw.py b/tools/data/skeleton/gen_ntu_rgbd_raw.py new file mode 100644 index 0000000000000000000000000000000000000000..57d6e35b110480a2aaf6af00e0660548b7465d6b --- /dev/null +++ b/tools/data/skeleton/gen_ntu_rgbd_raw.py @@ -0,0 +1,217 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import argparse
import os
import os.path as osp
from typing import Dict, List, Optional, Tuple

import numpy as np

# Subject / camera / setup IDs that belong to the *training* side of each
# NTU RGB+D benchmark split (used in get_names_and_labels below).
training_subjects_60 = [
    1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34, 35, 38
]
training_cameras_60 = [2, 3]
training_subjects_120 = [
    1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 25, 27, 28, 31, 34, 35, 38,
    45, 46, 47, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 70, 74, 78, 80, 81,
    82, 83, 84, 85, 86, 89, 91, 92, 93, 94, 95, 97, 98, 100, 103
]
training_setups_120 = [
    2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
]
max_body_true = 2  # number of bodies kept per sample
max_body_kinect = 4  # number of bodies the Kinect sensor may report
num_joint = 25
max_frame = 300


def read_skeleton_filter(file: str) -> Dict:
    """Parse one raw ``.skeleton`` file into a nested dict.

    The raw format is line oriented: a frame count, then per frame a body
    count and, per body, one line of body attributes, a joint count and one
    line per joint (x/y/z plus depth/color/orientation fields).
    """
    with open(file, 'r') as f:
        skeleton_sequence = {'num_frame': int(f.readline()), 'frameInfo': []}

        for t in range(skeleton_sequence['num_frame']):
            frame_info = {'numBody': int(f.readline()), 'bodyInfo': []}

            for m in range(frame_info['numBody']):
                body_info_key = [
                    'bodyID', 'clipedEdges', 'handLeftConfidence',
                    'handLeftState', 'handRightConfidence', 'handRightState',
                    'isResticted', 'leanX', 'leanY', 'trackingState'
                ]
                body_info = {
                    k: float(v)
                    for k, v in zip(body_info_key,
                                    f.readline().split())
                }
                body_info['numJoint'] = int(f.readline())
                body_info['jointInfo'] = []
                # `_` instead of the original `v`, which shadowed the `v`
                # of the comprehension below.
                for _ in range(body_info['numJoint']):
                    joint_info_key = [
                        'x', 'y', 'z', 'depthX', 'depthY', 'colorX', 'colorY',
                        'orientationW', 'orientationX', 'orientationY',
                        'orientationZ', 'trackingState'
                    ]
                    joint_info = {
                        k: float(v)
                        for k, v in zip(joint_info_key,
                                        f.readline().split())
                    }
                    body_info['jointInfo'].append(joint_info)
                frame_info['bodyInfo'].append(body_info)
            skeleton_sequence['frameInfo'].append(frame_info)

    return skeleton_sequence


def get_nonzero_std(s: np.ndarray) -> float:  # T V C
    """Energy of one body: sum of per-channel stds over non-empty frames.

    Frames whose joints are all zero are dropped first; an all-zero body
    yields 0.
    """
    index = s.sum(-1).sum(-1) != 0
    s = s[index]
    if len(s) != 0:
        s = s[:, :, 0].std() + \
            s[:, :, 1].std() + \
            s[:, :, 2].std()  # three channels
    else:
        s = 0
    return s


def read_xyz(file: str, max_body: int = 4, num_joint: int = 25) -> np.ndarray:
    """Read one skeleton file into an (M, T, V, 3) array of xyz joints.

    The ``max_body_true`` (2) highest-energy bodies are kept and all-zero
    padding bodies are removed afterwards.
    """
    seq_info = read_skeleton_filter(file)
    data = np.zeros((max_body, seq_info['num_frame'], num_joint, 3))
    for n, f in enumerate(seq_info['frameInfo']):
        for m, b in enumerate(f['bodyInfo']):
            for j, v in enumerate(b['jointInfo']):
                # extra bodies / joints beyond the buffer are ignored
                if m < max_body and j < num_joint:
                    data[m, n, j, :] = [v['x'], v['y'], v['z']]

    # select the two bodies with maximum energy
    energy = np.array([get_nonzero_std(x) for x in data])
    index = energy.argsort()[::-1][0:max_body_true]
    data = data[index]

    # filter all-zero padding bodies
    data = data[data.sum((1, 2, 3)) != 0]
    return data


def get_names_and_labels(data_path: str,
                         task: str,
                         benchmark: str,
                         ignored_samples: Optional[List[str]] = None) -> Tuple:
    """Split the files under ``data_path`` into train/val for a benchmark.

    Filenames encode setup (S), camera (C), subject (P) and action (A) as
    3-digit integers; the benchmark decides which field drives the split.

    Returns:
        tuple: ``(train_names, train_labels, val_names, val_labels)`` with
        0-based labels.
    """
    train_names = []
    train_labels = []
    val_names = []
    val_labels = []

    for filename in os.listdir(data_path):
        if ignored_samples is not None and filename in ignored_samples:
            continue

        setup_number = int(filename[filename.find('S') + 1:filename.find('S') +
                                    4])
        action_class = int(filename[filename.find('A') + 1:filename.find('A') +
                                    4])
        subject_id = int(filename[filename.find('P') + 1:filename.find('P') +
                                  4])
        camera_id = int(filename[filename.find('C') + 1:filename.find('C') +
                                 4])

        if benchmark == 'xsub':
            if task == 'ntu60':
                istraining = (subject_id in training_subjects_60)
            else:
                istraining = (subject_id in training_subjects_120)
        elif benchmark == 'xview':
            istraining = (camera_id in training_cameras_60)
        elif benchmark == 'xset':
            istraining = (setup_number in training_setups_120)
        else:
            # original raised a bare ValueError() with no diagnostic
            raise ValueError(f'Unsupported benchmark: {benchmark!r}')

        if istraining:
            train_names.append(filename)
            train_labels.append(action_class - 1)
        else:
            val_names.append(filename)
            val_labels.append(action_class - 1)

    return train_names, train_labels, val_names, val_labels


def gendata(data_path: str,
            out_path: str,
            ignored_sample_path: Optional[str] = None,
            task: str = 'ntu60') -> None:
    """Generate the MMAction2 3D-skeleton annotation pickle for NTU RGB+D.

    Reads every ``.skeleton`` file under ``data_path``, builds the
    train/val split for each benchmark of ``task`` and dumps a dict with
    keys ``split`` and ``annotations`` to ``{out_path}/{task}_3d.pkl``.
    """
    # Deferred import so the pure-numpy helpers above stay importable
    # without mmengine installed.
    import mmengine

    split = dict()

    if ignored_sample_path is not None:
        with open(ignored_sample_path, 'r') as f:
            ignored_samples = [
                line.strip() + '.skeleton' for line in f.readlines()
            ]
    else:
        ignored_samples = []

    if task == 'ntu60':
        benchmarks = ['xsub', 'xview']
    else:
        benchmarks = ['xsub', 'xset']

    names = None
    labels = None
    for benchmark in benchmarks:
        train_names, train_labels, val_names, val_labels = \
            get_names_and_labels(data_path, task, benchmark, ignored_samples)
        split[f'{benchmark}_train'] = [osp.splitext(s)[0] for s in train_names]
        split[f'{benchmark}_val'] = [osp.splitext(s)[0] for s in val_names]

        # Every benchmark partitions the same file set, so collect the
        # full name/label lists only once.
        if names is None and labels is None:
            names = train_names + val_names
            labels = train_labels + val_labels

    results = []

    prog_bar = mmengine.ProgressBar(len(names))
    for i, s in enumerate(names):
        ske = read_xyz(
            osp.join(data_path, s),
            max_body=max_body_kinect,
            num_joint=num_joint).astype(np.float16)

        anno = dict()
        anno['frame_dir'] = osp.splitext(s)[0]
        anno['label'] = labels[i]
        anno['keypoint'] = ske
        anno['total_frames'] = ske.shape[1]
        results.append(anno)
        prog_bar.update()

    annotations = {'split': split, 'annotations': results}
    mmengine.dump(annotations, f'{out_path}/{task}_3d.pkl')


if __name__ == '__main__':
    import mmengine

    parser = argparse.ArgumentParser(
        description='Generate Pose Annotation for NTURGB-D raw skeleton data')
    parser.add_argument(
        '--data-path',
        type=str,
        help='raw skeleton data path',
        default='../../../data/ntu60/nturgb+d_skeletons/')
    parser.add_argument(
        '--ignored-sample-path',
        type=str,
        default='NTU_RGBD_samples_with_missing_skeletons.txt')
    parser.add_argument(
        '--out-folder', type=str, default='../../../data/skeleton/')
    parser.add_argument('--task', type=str, default='ntu60')
    args = parser.parse_args()

    assert args.task in ['ntu60', 'ntu120']

    mmengine.mkdir_or_exist(args.out_folder)
    # NOTE: the concluding ``gendata(args.data_path, args.out_folder, ...)``
    # call lives on the original trailing line that follows this block.
gendata(args.data_path, args.out_folder, args.ignored_sample_path, + args.task) diff --git a/tools/data/skeleton/label_map_gym99.txt b/tools/data/skeleton/label_map_gym99.txt new file mode 100644 index 0000000000000000000000000000000000000000..79f4010816b30047a74b1b71786c7d94770ebdbb --- /dev/null +++ b/tools/data/skeleton/label_map_gym99.txt @@ -0,0 +1,99 @@ +(VT) round-off, flic-flac with 0.5 turn on, stretched salto forward with 0.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1 turn off +(VT) round-off, flic-flac on, stretched salto backward with 1.5 turn off +(VT) round-off, flic-flac on, stretched salto backward with 2.5 turn off +(VT) round-off, flic-flac on, stretched salto backward off +(FX) switch leap with 0.5 turn +(FX) switch leap with 1 turn +(FX) split leap with 1 turn +(FX) split leap with 1.5 turn or more +(FX) switch leap (leap forward with leg change to cross split) +(FX) split jump with 1 turn +(FX) split jump (leg separation 180 degree parallel to the floor) +(FX) johnson with additional 0.5 turn +(FX) straddle pike or side split jump with 1 turn +(FX) switch leap to ring position +(FX) stag jump +(FX) 2 turn with free leg held upward in 180 split position throughout turn +(FX) 2 turn in tuck stand on one leg, free leg straight throughout turn +(FX) 3 turn on one leg, free leg optional below horizontal +(FX) 2 turn on one leg, free leg optional below horizontal +(FX) 1 turn on one leg, free leg optional below horizontal +(FX) 2 turn or more with heel of free leg forward at horizontal throughout turn +(FX) 1 turn with heel of free leg forward at horizontal throughout turn +(FX) arabian double salto tucked +(FX) salto forward tucked +(FX) aerial walkover forward +(FX) salto forward stretched with 2 twist +(FX) salto forward stretched with 1 twist +(FX) salto forward stretched with 1.5 twist +(FX) salto forward stretched, feet land together +(FX) 
double salto backward stretched +(FX) salto backward stretched with 3 twist +(FX) salto backward stretched with 2 twist +(FX) salto backward stretched with 2.5 twist +(FX) salto backward stretched with 1.5 twist +(FX) double salto backward tucked with 2 twist +(FX) double salto backward tucked with 1 twist +(FX) double salto backward tucked +(FX) double salto backward piked with 1 twist +(FX) double salto backward piked +(BB) sissone (leg separation 180 degree on the diagonal to the floor, take off two feet, land on one foot) +(BB) split jump with 0.5 turn in side position +(BB) split jump +(BB) straddle pike jump or side split jump +(BB) split ring jump (ring jump with front leg horizontal to the floor) +(BB) switch leap with 0.5 turn +(BB) switch leap (leap forward with leg change) +(BB) split leap forward +(BB) johnson (leap forward with leg change and 0.25 turn to side split or straddle pike position) +(BB) switch leap to ring position +(BB) sheep jump (jump with upper back arch and head release with feet to head height/closed Ring) +(BB) wolf hop or jump (hip angle at 45, knees together) +(BB) 1 turn with heel of free leg forward at horizontal throughout turn +(BB) 2 turn on one leg, free leg optional below horizontal +(BB) 1 turn on one leg, free leg optional below horizontal +(BB) 2 turn in tuck stand on one leg, free leg optional +(BB) salto backward tucked with 1 twist +(BB) salto backward tucked +(BB) salto backward stretched-step out (feet land successively) +(BB) salto backward stretched with legs together +(BB) salto sideward tucked, take off from one leg to side stand +(BB) free aerial cartwheel landing in cross position +(BB) salto forward tucked to cross stand +(BB) free aerial walkover forward, landing on one or both feet +(BB) jump backward, flic-flac take-off with 0.5 twist through handstand to walkover forward, also with support on one arm +(BB) flic-flac to land on both feet +(BB) flic-flac with step-out, also with support on one arm +(BB) 
round-off +(BB) double salto backward tucked +(BB) salto backward tucked +(BB) double salto backward piked +(BB) salto backward stretched with 2 twist +(BB) salto backward stretched with 2.5 twist +(UB) pike sole circle backward with 1 turn to handstand +(UB) pike sole circle backward with 0.5 turn to handstand +(UB) pike sole circle backward to handstand +(UB) giant circle backward with 1 turn to handstand +(UB) giant circle backward with 0.5 turn to handstand +(UB) giant circle backward +(UB) giant circle forward with 1 turn on one arm before handstand phase +(UB) giant circle forward with 0.5 turn to handstand +(UB) giant circle forward +(UB) clear hip circle backward to handstand +(UB) clear pike circle backward with 1 turn to handstand +(UB) clear pike circle backward with 0.5 turn to handstand +(UB) clear pike circle backward to handstand +(UB) stalder backward with 1 turn to handstand +(UB) stalder backward to handstand +(UB) counter straddle over high bar to hang +(UB) counter piked over high bar to hang +(UB) (swing backward or front support) salto forward straddled to hang on high bar +(UB) (swing backward) salto forward piked to hang on high bar +(UB) (swing forward or hip circle backward) salto backward with 0.5 turn piked to hang on high bar +(UB) transition flight from high bar to low bar +(UB) transition flight from low bar to high bar +(UB) (swing forward) double salto backward tucked with 1 turn +(UB) (swing backward) double salto forward tucked +(UB) (swing forward) double salto backward stretched diff --git a/tools/data/skeleton/label_map_ntu60.txt b/tools/data/skeleton/label_map_ntu60.txt new file mode 100644 index 0000000000000000000000000000000000000000..a39bae255609734f7c77bb053179c40fcfbef75d --- /dev/null +++ b/tools/data/skeleton/label_map_ntu60.txt @@ -0,0 +1,60 @@ +drink water +eat meal/snack +brushing teeth +brushing hair +drop +pickup +throw +sitting down +standing up (from sitting position) +clapping +reading +writing +tear up paper 
+wear jacket +take off jacket +wear a shoe +take off a shoe +wear on glasses +take off glasses +put on a hat/cap +take off a hat/cap +cheer up +hand waving +kicking something +reach into pocket +hopping (one foot jumping) +jump up +make a phone call/answer phone +playing with phone/tablet +typing on a keyboard +pointing to something with finger +taking a selfie +check time (from watch) +rub two hands together +nod head/bow +shake head +wipe face +salute +put the palms together +cross hands in front (say stop) +sneeze/cough +staggering +falling +touch head (headache) +touch chest (stomachache/heart pain) +touch back (backache) +touch neck (neckache) +nausea or vomiting condition +use a fan (with hand or paper)/feeling warm +punching/slapping other person +kicking other person +pushing other person +pat on back of other person +point finger at the other person +hugging other person +giving something to other person +touch other person's pocket +handshaking +walking towards each other +walking apart from each other diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee57874903951eeffd24bc64a77b70e594d3dab --- /dev/null +++ b/tools/data/skeleton/ntu_pose_extraction.py @@ -0,0 +1,305 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import abc +import argparse +import os.path as osp +from collections import defaultdict +from tempfile import TemporaryDirectory + +import mmengine +import numpy as np + +from mmaction.apis import detection_inference, pose_inference +from mmaction.utils import frame_extract + +args = abc.abstractproperty() +args.det_config = 'demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py' # noqa: E501 +args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 +args.det_score_thr = 0.5 +args.pose_config = 'demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' # noqa: E501 +args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 + + +def intersection(b0, b1): + l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) + u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) + return max(0, r - l) * max(0, d - u) + + +def iou(b0, b1): + i = intersection(b0, b1) + u = area(b0) + area(b1) - i + return i / u + + +def area(b): + return (b[2] - b[0]) * (b[3] - b[1]) + + +def removedup(bbox): + + def inside(box0, box1, threshold=0.8): + return intersection(box0, box1) / area(box0) > threshold + + num_bboxes = bbox.shape[0] + if num_bboxes == 1 or num_bboxes == 0: + return bbox + valid = [] + for i in range(num_bboxes): + flag = True + for j in range(num_bboxes): + if i != j and inside(bbox[i], + bbox[j]) and bbox[i][4] <= bbox[j][4]: + flag = False + break + if flag: + valid.append(i) + return bbox[valid] + + +def is_easy_example(det_results, num_person): + threshold = 0.95 + + def thre_bbox(bboxes, threshold=threshold): + shape = [sum(bbox[:, -1] > threshold) for bbox in bboxes] + ret = np.all(np.array(shape) == shape[0]) + return shape[0] if ret else -1 + + if thre_bbox(det_results) == num_person: + det_results = [x[x[..., -1] > 0.95] for x in det_results] + return 
True, np.stack(det_results) + return False, thre_bbox(det_results) + + +def bbox2tracklet(bbox): + iou_thre = 0.6 + tracklet_id = -1 + tracklet_st_frame = {} + tracklets = defaultdict(list) + for t, box in enumerate(bbox): + for idx in range(box.shape[0]): + matched = False + for tlet_id in range(tracklet_id, -1, -1): + cond1 = iou(tracklets[tlet_id][-1][-1], box[idx]) >= iou_thre + cond2 = ( + t - tracklet_st_frame[tlet_id] - len(tracklets[tlet_id]) < + 10) + cond3 = tracklets[tlet_id][-1][0] != t + if cond1 and cond2 and cond3: + matched = True + tracklets[tlet_id].append((t, box[idx])) + break + if not matched: + tracklet_id += 1 + tracklet_st_frame[tracklet_id] = t + tracklets[tracklet_id].append((t, box[idx])) + return tracklets + + +def drop_tracklet(tracklet): + tracklet = {k: v for k, v in tracklet.items() if len(v) > 5} + + def meanarea(track): + boxes = np.stack([x[1] for x in track]).astype(np.float32) + areas = (boxes[..., 2] - boxes[..., 0]) * ( + boxes[..., 3] - boxes[..., 1]) + return np.mean(areas) + + tracklet = {k: v for k, v in tracklet.items() if meanarea(v) > 5000} + return tracklet + + +def distance_tracklet(tracklet): + dists = {} + for k, v in tracklet.items(): + bboxes = np.stack([x[1] for x in v]) + c_x = (bboxes[..., 2] + bboxes[..., 0]) / 2. + c_y = (bboxes[..., 3] + bboxes[..., 1]) / 2. 
+ c_x -= 480 + c_y -= 270 + c = np.concatenate([c_x[..., None], c_y[..., None]], axis=1) + dist = np.linalg.norm(c, axis=1) + dists[k] = np.mean(dist) + return dists + + +def tracklet2bbox(track, num_frame): + # assign_prev + bbox = np.zeros((num_frame, 5)) + trackd = {} + for k, v in track: + bbox[k] = v + trackd[k] = v + for i in range(num_frame): + if bbox[i][-1] <= 0.5: + mind = np.Inf + for k in trackd: + if np.abs(k - i) < mind: + mind = np.abs(k - i) + bbox[i] = bbox[k] + return bbox + + +def tracklets2bbox(tracklet, num_frame): + dists = distance_tracklet(tracklet) + sorted_inds = sorted(dists, key=lambda x: dists[x]) + dist_thre = np.Inf + for i in sorted_inds: + if len(tracklet[i]) >= num_frame / 2: + dist_thre = 2 * dists[i] + break + + dist_thre = max(50, dist_thre) + + bbox = np.zeros((num_frame, 5)) + bboxd = {} + for idx in sorted_inds: + if dists[idx] < dist_thre: + for k, v in tracklet[idx]: + if bbox[k][-1] < 0.01: + bbox[k] = v + bboxd[k] = v + bad = 0 + for idx in range(num_frame): + if bbox[idx][-1] < 0.01: + bad += 1 + mind = np.Inf + mink = None + for k in bboxd: + if np.abs(k - idx) < mind: + mind = np.abs(k - idx) + mink = k + bbox[idx] = bboxd[mink] + return bad, bbox[:, None, :] + + +def bboxes2bbox(bbox, num_frame): + ret = np.zeros((num_frame, 2, 5)) + for t, item in enumerate(bbox): + if item.shape[0] <= 2: + ret[t, :item.shape[0]] = item + else: + inds = sorted( + list(range(item.shape[0])), key=lambda x: -item[x, -1]) + ret[t] = item[inds[:2]] + for t in range(num_frame): + if ret[t, 0, -1] <= 0.01: + ret[t] = ret[t - 1] + elif ret[t, 1, -1] <= 0.01: + if t: + if ret[t - 1, 0, -1] > 0.01 and ret[t - 1, 1, -1] > 0.01: + if iou(ret[t, 0], ret[t - 1, 0]) > iou( + ret[t, 0], ret[t - 1, 1]): + ret[t, 1] = ret[t - 1, 1] + else: + ret[t, 1] = ret[t - 1, 0] + return ret + + +def ntu_det_postproc(vid, det_results): + det_results = [removedup(x) for x in det_results] + label = int(vid.split('/')[-1].split('A')[1][:3]) + mpaction = 
list(range(50, 61)) + list(range(106, 121)) + n_person = 2 if label in mpaction else 1 + is_easy, bboxes = is_easy_example(det_results, n_person) + if is_easy: + print('\nEasy Example') + return bboxes + + tracklets = bbox2tracklet(det_results) + tracklets = drop_tracklet(tracklets) + + print(f'\nHard {n_person}-person Example, found {len(tracklets)} tracklet') + if n_person == 1: + if len(tracklets) == 1: + tracklet = list(tracklets.values())[0] + det_results = tracklet2bbox(tracklet, len(det_results)) + return np.stack(det_results) + else: + bad, det_results = tracklets2bbox(tracklets, len(det_results)) + return det_results + # n_person is 2 + if len(tracklets) <= 2: + tracklets = list(tracklets.values()) + bboxes = [] + for tracklet in tracklets: + bboxes.append(tracklet2bbox(tracklet, len(det_results))[:, None]) + bbox = np.concatenate(bboxes, axis=1) + return bbox + else: + return bboxes2bbox(det_results, len(det_results)) + + +def pose_inference_with_align(args, frame_paths, det_results): + # filter frame without det bbox + det_results = [ + frm_dets for frm_dets in det_results if frm_dets.shape[0] > 0 + ] + + pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint, + frame_paths, det_results, args.device) + # align the num_person among frames + num_persons = max([pose['keypoints'].shape[0] for pose in pose_results]) + num_points = pose_results[0]['keypoints'].shape[1] + num_frames = len(pose_results) + keypoints = np.zeros((num_persons, num_frames, num_points, 2), + dtype=np.float32) + scores = np.zeros((num_persons, num_frames, num_points), dtype=np.float32) + + for f_idx, frm_pose in enumerate(pose_results): + frm_num_persons = frm_pose['keypoints'].shape[0] + for p_idx in range(frm_num_persons): + keypoints[p_idx, f_idx] = frm_pose['keypoints'][p_idx] + scores[p_idx, f_idx] = frm_pose['keypoint_scores'][p_idx] + + return keypoints, scores + + +def ntu_pose_extraction(vid, skip_postproc=False): + tmp_dir = TemporaryDirectory() + 
frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name) + det_results, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device, + with_score=True) + + if not skip_postproc: + det_results = ntu_det_postproc(vid, det_results) + + anno = dict() + + keypoints, scores = pose_inference_with_align(args, frame_paths, + det_results) + anno['keypoint'] = keypoints + anno['keypoint_score'] = scores + anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] + anno['img_shape'] = (1080, 1920) + anno['original_shape'] = (1080, 1920) + anno['total_frames'] = keypoints.shape[1] + anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1 + tmp_dir.cleanup() + + return anno + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate Pose Annotation for a single NTURGB-D video') + parser.add_argument('video', type=str, help='source video') + parser.add_argument('output', type=str, help='output pickle name') + parser.add_argument('--device', type=str, default='cuda:0') + parser.add_argument('--skip-postproc', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + global_args = parse_args() + args.device = global_args.device + args.video = global_args.video + args.output = global_args.output + args.skip_postproc = global_args.skip_postproc + anno = ntu_pose_extraction(args.video, args.skip_postproc) + mmengine.dump(anno, args.output) diff --git a/tools/data/sthv1/README.md b/tools/data/sthv1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d8611bdfd987d8fb8e76e66ee2b7b5697c888a0 --- /dev/null +++ b/tools/data/sthv1/README.md @@ -0,0 +1,144 @@ +# Preparing Something-Something V1 + +## Introduction + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna 
Materzyńska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, you can refer to the dataset [paper](https://arxiv.org/pdf/1706.04261.pdf). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv1/`. + +## Step 1. Prepare Annotations + +Since the official [website](https://20bn.com/datasets/something-something/v1) of Something-Something V1 is currently unavailable, you can download the annotations from third-part source to `$MMACTION2/data/sthv1/` . + +## Step 2. Prepare RGB Frames + +Since the official dataset doesn't provide the original video data and only extracted RGB frames are available, you have to directly download RGB frames. + +You can download all compressed file parts from third-part source to `$MMACTION2/data/sthv1/` and use the following command to uncompress. + +```shell +cd $MMACTION2/data/sthv1/ +cat 20bn-something-something-v1-?? | tar zx +cd $MMACTION2/tools/data/sthv1/ +``` + +For users who only want to use RGB frames, you can skip to step 5 to generate file lists in the format of rawframes. 
+Since the prefix of official JPGs is "%05d.jpg" (e.g., "00001.jpg"), users need to add `"filename_tmpl='{:05}.jpg'"` to the dict of `data.train`, `data.val` and `data.test` in the config files related with sthv1 like this: + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## Step 3. Extract Flow + +This part is **optional** if you only want to use RGB frames. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/sthv1_extracted/ +ln -s /mnt/SSD/sthv1_extracted/ ../../../data/sthv1/rawframes +``` + +Then, you can run the following script to extract optical flow based on RGB frames. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash extract_flow.sh +``` + +## Step 4. Encode Videos + +This part is **optional** if you only want to use RGB frames. + +You can run the following script to encode videos. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash encode_videos.sh +``` + +## Step 5. Generate File List + +You can run the follow script to generate file list in the format of rawframes and videos. + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## Step 6. 
Check Directory Structure + +After the whole data process for Something-Something V1 preparation, +you will get the rawframes (RGB + Flow), and annotation files for Something-Something V1. + +In the context of the whole project (for Something-Something V1 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── sthv1 +│ │ ├── sthv1_{train,val}_list_rawframes.txt +│ │ ├── sthv1_{train,val}_list_videos.txt +│ │ ├── annotations +│ | ├── videos +│ | | ├── 1.mp4 +│ | | ├── 2.mp4 +│ | | ├──... +│ | ├── rawframes +│ | | ├── 1 +│ | | | ├── 00001.jpg +│ | | | ├── 00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2 +│ | | ├── ... + +``` + +For training and evaluating on Something-Something V1, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/sthv1/README_zh-CN.md b/tools/data/sthv1/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..b8a37d63ef81777846aa5f03cc7539b3825c7d99 --- /dev/null +++ b/tools/data/sthv1/README_zh-CN.md @@ -0,0 +1,142 @@ +# 准备 Something-Something V1 + +## 简介 + +``` +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyńska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +用户可参考该数据集的 [官网](https://20bn.com/datasets/something-something/v1),以获取数据集相关的基本信息。 +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/sthv1/`。 + +## 步骤 1. 
下载标注文件 + +由于 Something-Something V1 的官方网站已经失效,用户需要通过第三方源下载原始数据集。下载好的标注文件需要放在 `$MMACTION2/data/sthv1/annotations` 文件夹下。 + +## 步骤 2. 准备 RGB 帧 + +官方数据集并未提供原始视频文件,只提供了对原视频文件进行抽取得到的 RGB 帧,用户可在第三方源直接下载视频帧。 + +将下载好的压缩文件放在 `$MMACTION2/data/sthv1/` 文件夹下,并使用以下脚本进行解压。 + +```shell +cd $MMACTION2/data/sthv1/ +cat 20bn-something-something-v1-?? | tar zx +cd $MMACTION2/tools/data/sthv1/ +``` + +如果用户只想使用 RGB 帧,则可以跳过中间步骤至步骤 5 以直接生成视频帧的文件列表。 +由于官网的 JPG 文件名形如 "%05d.jpg" (比如,"00001.jpg"),需要在配置文件的 `data.train`, `data.val` 和 `data.test` 处添加 `"filename_tmpl='{:05}.jpg'"` 代码,以修改文件名模板。 + +``` +data = dict( + videos_per_gpu=16, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +``` + +## 步骤 3. 抽取光流 + +如果用户只想使用原 RGB 帧加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/sthv1_extracted/ +ln -s /mnt/SSD/sthv1_extracted/ ../../../data/sthv1/rawframes +``` + +如果想抽取光流,则可以运行以下脚本从 RGB 帧中抽取出光流。 + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash extract_flow.sh +``` + +## 步骤 4: 编码视频 + +如果用户只想使用 RGB 帧加载训练,则该部分是 **可选项**。 + +用户可以运行以下命令进行视频编码。 + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash encode_videos.sh +``` + +## 步骤 5. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +cd $MMACTION2/tools/data/sthv1/ +bash generate_{rawframes, videos}_filelist.sh +``` + +## 步骤 6. 
检查文件夹结构 + +在完成所有 Something-Something V1 数据集准备流程后, +用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Something-Something V1 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── sthv1 +│ │ ├── sthv1_{train,val}_list_rawframes.txt +│ │ ├── sthv1_{train,val}_list_videos.txt +│ │ ├── annotations +│ | ├── videos +│ | | ├── 1.mp4 +│ | | ├── 2.mp4 +│ | | ├──... +│ | ├── rawframes +│ | | ├── 1 +│ | | | ├── 00001.jpg +│ | | | ├── 00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2 +│ | | ├── ... + +``` + +关于对 Something-Something V1 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/sthv1/encode_videos.sh b/tools/data/sthv1/encode_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..98b0166bbe3bd38cc368466f5089ee137662d257 --- /dev/null +++ b/tools/data/sthv1/encode_videos.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_videos.py ../../data/sthv1/rawframes/ ../../data/sthv1/videos/ --fps 12 --level 1 --start-idx 1 --filename-tmpl '%05d' +echo "Encode videos" + +cd sthv1/ diff --git a/tools/data/sthv1/extract_flow.sh b/tools/data/sthv1/extract_flow.sh new file mode 100644 index 0000000000000000000000000000000000000000..04925200f13a4c26507a5ef6c87e5ded00eb1ede --- /dev/null +++ b/tools/data/sthv1/extract_flow.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv1/rawframes/ ../../data/sthv1/rawframes/ --task flow --level 1 --flow-type tvl1 --input-frames +echo "Flow (tv-l1) Generated" +cd sthv1/ diff --git a/tools/data/sthv1/generate_rawframes_filelist.sh b/tools/data/sthv1/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..090695e4c4f28a2c5622a7b9579cc8da673a9a64 --- /dev/null +++ b/tools/data/sthv1/generate_rawframes_filelist.sh @@ -0,0 +1,8 
@@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/rawframes/ --rgb-prefix '0' --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/sthv1/ diff --git a/tools/data/sthv1/generate_videos_filelist.sh b/tools/data/sthv1/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd59fb31042e10d1e43bb87ed853d656a2257bd4 --- /dev/null +++ b/tools/data/sthv1/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv1 data/sthv1/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/sthv1/ diff --git a/tools/data/sthv1/label_map.txt b/tools/data/sthv1/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4f8782c06781e6fd5366a1c877e0a8ae53559b6 --- /dev/null +++ b/tools/data/sthv1/label_map.txt @@ -0,0 +1,174 @@ +Holding something +Turning something upside down +Turning the camera left while filming something +Stacking number of something +Turning the camera right while filming something +Opening something +Approaching something with your camera +Picking something up +Pushing something so that it almost falls off but doesn't +Folding something +Moving something away from the camera +Closing something +Moving away from something with your camera +Turning the camera downwards while filming something +Pushing something so that it slightly moves +Turning the camera upwards while filming something +Pretending to pick something up +Showing something to the camera +Moving something up +Plugging something into something +Unfolding something +Putting something onto something +Showing that something is empty +Pretending to put something on a surface +Taking something from somewhere +Putting something next to something +Moving something towards the camera +Showing a photo of something to the camera +Pushing something with something +Throwing something +Pushing something from left to right +Something falling like a feather or paper +Throwing something in the air and letting it fall +Throwing something against something +Lifting something with something on it +Taking one of many similar things on the table +Showing something behind something +Putting something into something +Tearing something just a little bit +Moving something away from something +Tearing something into two pieces +Pushing something from right to left +Holding something next to something +Putting something, something and something on the table +Pretending to take something from somewhere +Moving something closer to something +Pretending to 
put something next to something +Uncovering something +Something falling like a rock +Putting something and something on the table +Pouring something into something +Moving something down +Pulling something from right to left +Throwing something in the air and catching it +Tilting something with something on it until it falls off +Putting something in front of something +Pretending to turn something upside down +Putting something on a surface +Pretending to throw something +Showing something on top of something +Covering something with something +Squeezing something +Putting something similar to other things that are already on the table +Lifting up one end of something, then letting it drop down +Taking something out of something +Moving part of something +Pulling something from left to right +Lifting something up completely without letting it drop down +Attaching something to something +Putting something behind something +Moving something and something closer to each other +Holding something in front of something +Pushing something so that it falls off the table +Holding something over something +Pretending to open something without actually opening it +Removing something, revealing something behind +Hitting something with something +Moving something and something away from each other +Touching (without moving) part of something +Pretending to put something into something +Showing that something is inside something +Lifting something up completely, then letting it drop down +Pretending to take something out of something +Holding something behind something +Laying something on the table on its side, not upright +Poking something so it slightly moves +Pretending to close something without actually closing it +Putting something upright on the table +Dropping something in front of something +Dropping something behind something +Lifting up one end of something without letting it drop down +Rolling something on a flat surface +Throwing something onto a surface +Showing 
something next to something +Dropping something onto something +Stuffing something into something +Dropping something into something +Piling something up +Letting something roll along a flat surface +Twisting something +Spinning something that quickly stops spinning +Putting number of something onto something +Putting something underneath something +Moving something across a surface without it falling down +Plugging something into something but pulling it right out as you remove your hand +Dropping something next to something +Poking something so that it falls over +Spinning something so it continues spinning +Poking something so lightly that it doesn't or almost doesn't move +Wiping something off of something +Moving something across a surface until it falls down +Pretending to poke something +Putting something that cannot actually stand upright upright on the table, so it falls on its side +Pulling something out of something +Scooping something up with something +Pretending to be tearing something that is not tearable +Burying something in something +Tipping something over +Tilting something with something on it slightly so it doesn't fall down +Pretending to put something onto something +Bending something until it breaks +Letting something roll down a slanted surface +Trying to bend something unbendable so nothing happens +Bending something so that it deforms +Digging something out of something +Pretending to put something underneath something +Putting something on a flat surface without letting it roll +Putting something on the edge of something so it is not supported and falls down +Spreading something onto something +Pretending to put something behind something +Sprinkling something onto something +Something colliding with something and both come to a halt +Pushing something off of something +Putting something that can't roll onto a slanted surface, so it stays where it is +Lifting a surface with something on it until it starts sliding down +Pretending or 
failing to wipe something off of something +Trying but failing to attach something to something because it doesn't stick +Pulling something from behind of something +Pushing something so it spins +Pouring something onto something +Pulling two ends of something but nothing happens +Moving something and something so they pass each other +Pretending to sprinkle air onto something +Putting something that can't roll onto a slanted surface, so it slides down +Something colliding with something and both are being deflected +Pretending to squeeze something +Pulling something onto something +Putting something onto something else that cannot support it so it falls down +Lifting a surface with something on it but not enough for it to slide down +Pouring something out of something +Moving something and something so they collide with each other +Tipping something with something in it over, so something in it falls out +Letting something roll up a slanted surface, so it rolls back down +Pretending to scoop something up with something +Pretending to pour something out of something, but something is empty +Pulling two ends of something so that it gets stretched +Failing to put something into something because something does not fit +Pretending or trying and failing to twist something +Trying to pour something into something, but missing so it spills next to it +Something being deflected from something +Poking a stack of something so the stack collapses +Spilling something onto something +Pulling two ends of something so that it separates into two pieces +Pouring something into something until it overflows +Pretending to spread air onto something +Twisting (wringing) something wet until water comes out +Poking a hole into something soft +Spilling something next to something +Poking a stack of something without the stack collapsing +Putting something onto a slanted surface but it doesn't glide down +Pushing something onto something +Poking something so that it spins around +Spilling 
something behind something +Poking a hole into some substance diff --git a/tools/data/sthv2/README.md b/tools/data/sthv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ea2f7dc1c7c8a394c4b70c74f4c06660fe3251c --- /dev/null +++ b/tools/data/sthv2/README.md @@ -0,0 +1,135 @@ +# Preparing Something-Something V2 + +## Introduction + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyńska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://developer.qualcomm.com/software/ai-datasets/something-something). + +`````{tabs} + +````{group-tab} Download by MIM +MIM supports downloading from OpenDataLab and preprocessing Something-Something V2 dataset with one command line. +```Bash +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login +# download and preprocess by MIM +mim download mmaction2 --dataset sthv2 +``` + +```` + +## Step 1. Prepare Annotations + +First of all, you have to sign in and download annotations to `$MMACTION2/data/sthv2/annotations` on the official [website](https://20bn.com/datasets/something-something/v2). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv2/`. +## Step 2. Prepare Videos + +Then, you can download all data parts to `$MMACTION2/data/sthv2/` and use the following command to uncompress. + +```shell +cd $MMACTION2/data/sthv2/ +cat 20bn-something-something-v2-?? | tar zx +cd $MMACTION2/tools/data/sthv2/ +``` + +## Step 3. 
Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two lines (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/sthv2_extracted/ +ln -s /mnt/SSD/sthv2_extracted/ ../../../data/sthv2/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_frames.sh +``` + +## Step 4. Generate File List + +You can run the following script to generate file lists in the format of rawframes and videos. + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash generate_{rawframes, videos}_filelist.sh +``` + +```` +````` + +### Check Directory Structure + +After the whole data process for Something-Something V2 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for Something-Something V2. 
+ +In the context of the whole project (for Something-Something V2 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── sthv2 +│ │ ├── sthv2_{train,val}_list_rawframes.txt(Optional) +│ │ ├── sthv2_{train,val}_list_videos.txt +│ │ ├── annotations(Optional) +│ | ├── videos +│ | | ├── 1.mp4 +│ | | ├── 2.mp4 +│ | | ├──... +│ | ├── rawframes(Optional) +│ | | ├── 1 +│ | | | ├── img_00001.jpg +│ | | | ├── img_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2 +│ | | ├── ... + +``` + +For training and evaluating on Something-Something V2, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/sthv2/README_zh-CN.md b/tools/data/sthv2/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..da8b081f20bdfdd630c436e8f6661b695512ea9c --- /dev/null +++ b/tools/data/sthv2/README_zh-CN.md @@ -0,0 +1,137 @@ +# 准备 Something-Something V2 + +## 简介 + + + +```BibTeX +@misc{goyal2017something, + title={The "something something" video database for learning and evaluating visual common sense}, + author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyńska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic}, + year={2017}, + eprint={1706.04261}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +用户可参考该数据集的 [官网](https://developer.qualcomm.com/software/ai-datasets/something-something),以获取数据集相关的基本信息。 + +`````{tabs} + +````{group-tab} 使用 MIM 下载 +# MIM 支持下载 Something-Something V2 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 +```Bash +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login +# 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 
+mim download mmaction2 --dataset sthv2 +``` + +```` + +````{group-tab} 从官方源下载 +## 步骤 1. 下载标注文件 + +首先,用户需要在 [官网](https://developer.qualcomm.com/software/ai-datasets/something-something) 完成注册,才能下载标注文件。下载好的标注文件需要放在 `$MMACTION2/data/sthv2/annotations` 文件夹下。 +用户可以使用以下命令下载标注文件。在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/sthv2/`。 + +## 步骤 2. 准备视频 + +之后,用户可将下载好的压缩文件放在 `$MMACTION2/data/sthv2/` 文件夹下,并且使用以下指令进行解压。 + +```shell +cd $MMACTION2/data/sthv2/ +cat 20bn-something-something-v2-?? | tar zx +cd $MMACTION2/tools/data/sthv2/ +``` + +## 步骤 3. 抽取 RGB 帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/sthv2_extracted/ +ln -s /mnt/SSD/sthv2_extracted/ ../../../data/sthv2/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本进行抽取。 + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash extract_frames.sh +``` + +## 步骤 4. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +cd $MMACTION2/tools/data/sthv2/ +bash generate_{rawframes, videos}_filelist.sh +``` + +```` +````` + +### 检查文件夹结构 + +在完成所有 Something-Something V2 数据集准备流程后, +用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,Something-Something V2 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── sthv2 +│ │ ├── sthv2_{train,val}_list_rawframes.txt(可选) +│ │ ├── sthv2_{train,val}_list_videos.txt +│ │ ├── annotations(可选) +│ | ├── videos +│ | | ├── 1.mp4 +│ | | ├── 2.mp4 +│ | | ├──... 
+│ | ├── rawframes(可选) +│ | | ├── 1 +│ | | | ├── img_00001.jpg +│ | | | ├── img_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_x_00001.jpg +│ | | | ├── flow_x_00002.jpg +│ | | | ├── ... +│ | | | ├── flow_y_00001.jpg +│ | | | ├── flow_y_00002.jpg +│ | | | ├── ... +│ | | ├── 2 +│ | | ├── ... + +``` + +关于对 Something-Something V2 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/sthv2/extract_frames.sh b/tools/data/sthv2/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..575a132b7cd464a242881cbe13e1a86088a9e0ed --- /dev/null +++ b/tools/data/sthv2/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task both --level 1 --flow-type tvl1 --ext webm +echo "Raw frames (RGB and tv-l1) Generated" +cd sthv2/ diff --git a/tools/data/sthv2/extract_rgb_frames.sh b/tools/data/sthv2/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0da858b318b12eb02b16c21b06e6e71e9a7df40 --- /dev/null +++ b/tools/data/sthv2/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task rgb --level 1 --ext webm +echo "Genearte raw frames (RGB only)" + +cd sthv2/ diff --git a/tools/data/sthv2/extract_rgb_frames_opencv.sh b/tools/data/sthv2/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..53cca00fa0f5e144df45613f8088ae7d725ab296 --- /dev/null +++ b/tools/data/sthv2/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/sthv2/videos/ ../../data/sthv2/rawframes/ --task rgb --level 1 --ext webm --use-opencv +echo "Genearte raw frames (RGB only)" + +cd sthv2/ diff --git a/tools/data/sthv2/generate_rawframes_filelist.sh b/tools/data/sthv2/generate_rawframes_filelist.sh new file mode 100644 index 
0000000000000000000000000000000000000000..e782f981b8020c8109106f5414fc631c3d207222 --- /dev/null +++ b/tools/data/sthv2/generate_rawframes_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/rawframes/ --num-split 1 --level 1 --subset train --format rawframes --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/rawframes/ --num-split 1 --level 1 --subset val --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/sthv2/ diff --git a/tools/data/sthv2/generate_videos_filelist.sh b/tools/data/sthv2/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..86101ac2068cb154cbec239001c9623e51388026 --- /dev/null +++ b/tools/data/sthv2/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/videos/ --num-split 1 --level 1 --subset train --format videos --shuffle +PYTHONPATH=. python tools/data/build_file_list.py sthv2 data/sthv2/videos/ --num-split 1 --level 1 --subset val --format videos --shuffle +echo "Filelist for videos generated." 
+ +cd tools/data/sthv2/ diff --git a/tools/data/sthv2/label_map.txt b/tools/data/sthv2/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..8202a005d3a3fdb58ba8424c83d136cfe80e6cdc --- /dev/null +++ b/tools/data/sthv2/label_map.txt @@ -0,0 +1,174 @@ +Approaching something with your camera +Attaching something to something +Bending something so that it deforms +Bending something until it breaks +Burying something in something +Closing something +Covering something with something +Digging something out of something +Dropping something behind something +Dropping something in front of something +Dropping something into something +Dropping something next to something +Dropping something onto something +Failing to put something into something because something does not fit +Folding something +Hitting something with something +Holding something +Holding something behind something +Holding something in front of something +Holding something next to something +Holding something over something +Laying something on the table on its side, not upright +Letting something roll along a flat surface +Letting something roll down a slanted surface +Letting something roll up a slanted surface, so it rolls back down +Lifting a surface with something on it but not enough for it to slide down +Lifting a surface with something on it until it starts sliding down +Lifting something up completely without letting it drop down +Lifting something up completely, then letting it drop down +Lifting something with something on it +Lifting up one end of something without letting it drop down +Lifting up one end of something, then letting it drop down +Moving away from something with your camera +Moving part of something +Moving something across a surface until it falls down +Moving something across a surface without it falling down +Moving something and something away from each other +Moving something and something closer to each other +Moving something and something so 
they collide with each other +Moving something and something so they pass each other +Moving something away from something +Moving something away from the camera +Moving something closer to something +Moving something down +Moving something towards the camera +Moving something up +Opening something +Picking something up +Piling something up +Plugging something into something +Plugging something into something but pulling it right out as you remove your hand +Poking a hole into some substance +Poking a hole into something soft +Poking a stack of something so the stack collapses +Poking a stack of something without the stack collapsing +Poking something so it slightly moves +Poking something so lightly that it doesn't or almost doesn't move +Poking something so that it falls over +Poking something so that it spins around +Pouring something into something +Pouring something into something until it overflows +Pouring something onto something +Pouring something out of something +Pretending or failing to wipe something off of something +Pretending or trying and failing to twist something +Pretending to be tearing something that is not tearable +Pretending to close something without actually closing it +Pretending to open something without actually opening it +Pretending to pick something up +Pretending to poke something +Pretending to pour something out of something, but something is empty +Pretending to put something behind something +Pretending to put something into something +Pretending to put something next to something +Pretending to put something on a surface +Pretending to put something onto something +Pretending to put something underneath something +Pretending to scoop something up with something +Pretending to spread air onto something +Pretending to sprinkle air onto something +Pretending to squeeze something +Pretending to take something from somewhere +Pretending to take something out of something +Pretending to throw something +Pretending to turn something 
upside down +Pulling something from behind of something +Pulling something from left to right +Pulling something from right to left +Pulling something onto something +Pulling something out of something +Pulling two ends of something but nothing happens +Pulling two ends of something so that it gets stretched +Pulling two ends of something so that it separates into two pieces +Pushing something from left to right +Pushing something from right to left +Pushing something off of something +Pushing something onto something +Pushing something so it spins +Pushing something so that it almost falls off but doesn't +Pushing something so that it falls off the table +Pushing something so that it slightly moves +Pushing something with something +Putting number of something onto something +Putting something and something on the table +Putting something behind something +Putting something in front of something +Putting something into something +Putting something next to something +Putting something on a flat surface without letting it roll +Putting something on a surface +Putting something on the edge of something so it is not supported and falls down +Putting something onto a slanted surface but it doesn't glide down +Putting something onto something +Putting something onto something else that cannot support it so it falls down +Putting something similar to other things that are already on the table +Putting something that can't roll onto a slanted surface, so it slides down +Putting something that can't roll onto a slanted surface, so it stays where it is +Putting something that cannot actually stand upright upright on the table, so it falls on its side +Putting something underneath something +Putting something upright on the table +Putting something, something and something on the table +Removing something, revealing something behind +Rolling something on a flat surface +Scooping something up with something +Showing a photo of something to the camera +Showing something behind 
something +Showing something next to something +Showing something on top of something +Showing something to the camera +Showing that something is empty +Showing that something is inside something +Something being deflected from something +Something colliding with something and both are being deflected +Something colliding with something and both come to a halt +Something falling like a feather or paper +Something falling like a rock +Spilling something behind something +Spilling something next to something +Spilling something onto something +Spinning something so it continues spinning +Spinning something that quickly stops spinning +Spreading something onto something +Sprinkling something onto something +Squeezing something +Stacking number of something +Stuffing something into something +Taking one of many similar things on the table +Taking something from somewhere +Taking something out of something +Tearing something into two pieces +Tearing something just a little bit +Throwing something +Throwing something against something +Throwing something in the air and catching it +Throwing something in the air and letting it fall +Throwing something onto a surface +Tilting something with something on it slightly so it doesn't fall down +Tilting something with something on it until it falls off +Tipping something over +Tipping something with something in it over, so something in it falls out +Touching (without moving) part of something +Trying but failing to attach something to something because it doesn't stick +Trying to bend something unbendable so nothing happens +Trying to pour something into something, but missing so it spills next to it +Turning something upside down +Turning the camera downwards while filming something +Turning the camera left while filming something +Turning the camera right while filming something +Turning the camera upwards while filming something +Twisting (wringing) something wet until water comes out +Twisting something +Uncovering 
something +Unfolding something +Wiping something off of something diff --git a/tools/data/sthv2/preprocss.sh b/tools/data/sthv2/preprocss.sh new file mode 100644 index 0000000000000000000000000000000000000000..888ebd335fe9fdd9b7fd102f5f5f91951351eccb --- /dev/null +++ b/tools/data/sthv2/preprocss.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +cat $DOWNLOAD_DIR/OpenDataLab___sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) +tar -xvf $DATA_ROOT/sthv2.tar -C $(dirname $DATA_ROOT) +rm $DATA_ROOT/sthv2.tar diff --git a/tools/data/thumos14/README.md b/tools/data/thumos14/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c76d82c98c1f8f52f9b93bb2850eef753affd722 --- /dev/null +++ b/tools/data/thumos14/README.md @@ -0,0 +1,142 @@ +# Preparing THUMOS'14 + +## Introduction + + + +```BibTeX +@misc{THUMOS14, + author = {Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, + I. and Shah, M. and Sukthankar, R.}, + title = {{THUMOS} Challenge: Action Recognition with a Large + Number of Classes}, + howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}", + Year = {2014} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://www.crcv.ucf.edu/THUMOS14/download.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/thumos14/`. + +## Step 1. Prepare Annotations + +First of all, run the following script to prepare annotations. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_annotations.sh +``` + +## Step 2. Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_videos.sh +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. 
+ +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/thumos14_extracted/ +ln -s /mnt/SSD/thumos14_extracted/ ../data/thumos14/rawframes/ +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames_opencv.sh +``` + +If both are required, run the following script to extract frames. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_frames.sh tvl1 +``` + +## Step 4. Fetch File List + +This part is **optional** if you do not use SSN model. + +You can run the follow script to fetch pre-computed tag proposals. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash fetch_tag_proposals.sh +``` + +## Step 5. Denormalize Proposal File + +This part is **optional** if you do not use SSN model. + +You can run the follow script to denormalize pre-computed tag proposals according to +actual number of local rawframes. + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash denormalize_proposal_file.sh +``` + +## Step 6. Check Directory Structure + +After the whole data process for THUMOS'14 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for THUMOS'14. 
+ +In the context of the whole project (for THUMOS'14 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── thumos14 +│ │ ├── proposals +│ │ | ├── thumos14_tag_val_normalized_proposal_list.txt +│ │ | ├── thumos14_tag_test_normalized_proposal_list.txt +│ │ ├── annotations_val +│ │ ├── annotations_test +│ │ ├── videos +│ │ │ ├── val +│ │ │ | ├── video_validation_0000001.mp4 +│ │ │ | ├── ... +│ │ | ├── test +│ │ │ | ├── video_test_0000001.mp4 +│ │ │ | ├── ... +│ │ ├── rawframes +│ │ │ ├── val +│ │ │ | ├── video_validation_0000001 +| │ │ | │ ├── img_00001.jpg +| │ │ | │ ├── img_00002.jpg +| │ │ | │ ├── ... +| │ │ | │ ├── flow_x_00001.jpg +| │ │ | │ ├── flow_x_00002.jpg +| │ │ | │ ├── ... +| │ │ | │ ├── flow_y_00001.jpg +| │ │ | │ ├── flow_y_00002.jpg +| │ │ | │ ├── ... +│ │ │ | ├── ... +│ │ | ├── test +│ │ │ | ├── video_test_0000001 +``` + +For training and evaluating on THUMOS'14, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/thumos14/README_zh-CN.md b/tools/data/thumos14/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ec4fd86624df6ce9b8fe8a2010dcd19d56bd676d --- /dev/null +++ b/tools/data/thumos14/README_zh-CN.md @@ -0,0 +1,139 @@ +# 准备 THUMOS'14 + +## 简介 + + + +```BibTex +@misc{THUMOS14, + author = {Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, + I. and Shah, M. and Sukthankar, R.}, + title = {{THUMOS} Challenge: Action Recognition with a Large + Number of Classes}, + howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}", + Year = {2014} +} +``` + +用户可以参照数据集 [官网](https://www.crcv.ucf.edu/THUMOS14/download.html),获取数据集相关的基本信息。 +在准备数据集前,请确保命令行当前路径为 `$MMACTION2/tools/data/thumos14/`。 + +## 步骤 1. 下载标注文件 + +首先,用户可使用以下命令下载标注文件。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_annotations.sh +``` + +## 步骤 2. 
下载视频 + +之后,用户可使用以下指令下载视频 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash download_videos.sh +``` + +## 步骤 3. 抽取帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。 +用户可使用以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行指令进行抽取(假设 SSD 挂载在 "/mnt/SSD/"上) +mkdir /mnt/SSD/thumos14_extracted/ +ln -s /mnt/SSD/thumos14_extracted/ ../data/thumos14/rawframes/ +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本进行抽取。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash extract_frames.sh tvl1 +``` + +## 步骤 4. 生成文件列表 + +如果用户不使用 SSN 模型,则该部分是 **可选项**。 + +可使用运行以下脚本下载预先计算的候选标签。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash fetch_tag_proposals.sh +``` + +## 步骤 5. 去规范化候选文件 + +如果用户不使用 SSN 模型,则该部分是 **可选项**。 + +可运行以下脚本,来根据本地原始帧的实际数量,去规范化预先计算的候选标签。 + +```shell +cd $MMACTION2/tools/data/thumos14/ +bash denormalize_proposal_file.sh +``` + +## 步骤 6. 检查目录结构 + +在完成 THUMOS'14 数据集准备流程后,用户可以得到 THUMOS'14 的 RGB 帧 + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,THUMOS'14 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── thumos14 +│ │ ├── proposals +│ │ | ├── thumos14_tag_val_normalized_proposal_list.txt +│ │ | ├── thumos14_tag_test_normalized_proposal_list.txt +│ │ ├── annotations_val +│ │ ├── annotations_test +│ │ ├── videos +│ │ │ ├── val +│ │ │ | ├── video_validation_0000001.mp4 +│ │ │ | ├── ... +│ │ | ├── test +│ │ │ | ├── video_test_0000001.mp4 +│ │ │ | ├── ... +│ │ ├── rawframes +│ │ │ ├── val +│ │ │ | ├── video_validation_0000001 +| │ │ | │ ├── img_00001.jpg +| │ │ | │ ├── img_00002.jpg +| │ │ | │ ├── ... 
+| │ │ | │ ├── flow_x_00001.jpg +| │ │ | │ ├── flow_x_00002.jpg +| │ │ | │ ├── ... +| │ │ | │ ├── flow_y_00001.jpg +| │ │ | │ ├── flow_y_00002.jpg +| │ │ | │ ├── ... +│ │ │ | ├── ... +│ │ | ├── test +│ │ │ | ├── video_test_0000001 +``` + +关于对 THUMOS'14 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。 diff --git a/tools/data/thumos14/denormalize_proposal_file.sh b/tools/data/thumos14/denormalize_proposal_file.sh new file mode 100644 index 0000000000000000000000000000000000000000..c31f11bb3ad3a5b993c7e08c9ae44f9841c8e96e --- /dev/null +++ b/tools/data/thumos14/denormalize_proposal_file.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../../../ +PYTHONPATH=. python tools/data/denormalize_proposal_file.py thumos14 --norm-proposal-file data/thumos14/proposals/thumos14_tag_val_normalized_proposal_list.txt --data-prefix data/thumos14/rawframes/val/ +echo "Proposal file denormalized for val set" + +PYTHONPATH=. python tools/data/denormalize_proposal_file.py thumos14 --norm-proposal-file data/thumos14/proposals/thumos14_tag_test_normalized_proposal_list.txt --data-prefix data/thumos14/rawframes/test/ +echo "Proposal file denormalized for test set" + +cd tools/data/thumos14/ diff --git a/tools/data/thumos14/download_annotations.sh b/tools/data/thumos14/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ad9d9d450f86a3778d7422b44675fe749160f13 --- /dev/null +++ b/tools/data/thumos14/download_annotations.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/thumos14/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi +cd ${DATA_DIR} + +wget http://crcv.ucf.edu/THUMOS14/Validation_set/TH14_Temporal_annotations_validation.zip --no-check-certificate +wget http://crcv.ucf.edu/THUMOS14/test_set/TH14_Temporal_annotations_test.zip --no-check-certificate + +if [ ! 
-d "./annotations_val" ]; then + mkdir ./annotations_val +fi +unzip -j TH14_Temporal_annotations_validation.zip -d annotations_val + +if [ ! -d "./annotations_test" ]; then + mkdir ./annotations_test +fi +unzip -j TH14_Temporal_annotations_test.zip -d annotations_test + +rm TH14_Temporal_annotations_validation.zip +rm TH14_Temporal_annotations_test.zip + +cd "../../tools/data/thumos14/" diff --git a/tools/data/thumos14/download_videos.sh b/tools/data/thumos14/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..571b080cf987737f580631bffc97a68060bd99b8 --- /dev/null +++ b/tools/data/thumos14/download_videos.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/thumos14/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://storage.googleapis.com/thumos14_files/TH14_validation_set_mp4.zip +wget https://storage.googleapis.com/thumos14_files/TH14_Test_set_mp4.zip + +if [ ! -d "./videos/val" ]; then + mkdir -p ./videos/val +fi +unzip -j TH14_validation_set_mp4.zip -d videos/val + +if [ ! 
-d "./videos/test" ]; then + mkdir -p ./videos/test +fi +unzip -P "THUMOS14_REGISTERED" -j TH14_Test_set_mp4.zip -d videos/test + +cd "../../tools/data/thumos14/" diff --git a/tools/data/thumos14/extract_frames.sh b/tools/data/thumos14/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..827708691d30a46215aeb1f8fd46fec0f52c6de8 --- /dev/null +++ b/tools/data/thumos14/extract_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ ../../data/thumos14/rawframes/val/ --level 1 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --flow-type tvl1 --ext mp4 --task both +echo "Raw frames (RGB and tv-l1) Generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/extract_rgb_frames.sh b/tools/data/thumos14/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c92065aefdd088186146455e10847085d19577c --- /dev/null +++ b/tools/data/thumos14/extract_rgb_frames.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ ../../data/thumos14/rawframes/val/ --level 1 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --ext mp4 --task rgb +echo "Raw frames (RGB only) generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/extract_rgb_frames_opencv.sh b/tools/data/thumos14/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..4048bbd7b9c9995d0e247155557790e3f50daa89 --- /dev/null +++ b/tools/data/thumos14/extract_rgb_frames_opencv.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/thumos14/videos/val/ 
../../data/thumos14/rawframes/val/ --level 1 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for val set" + +python build_rawframes.py ../../data/thumos14/videos/test/ ../../data/thumos14/rawframes/test/ --level 1 --ext mp4 --task rgb --use-opencv +echo "Raw frames (RGB only) generated for test set" + +cd thumos14/ diff --git a/tools/data/thumos14/fetch_tag_proposals.sh b/tools/data/thumos14/fetch_tag_proposals.sh new file mode 100644 index 0000000000000000000000000000000000000000..4a3654523952552eefc39cf033c2abd10ad0dfa6 --- /dev/null +++ b/tools/data/thumos14/fetch_tag_proposals.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +PROP_DIR="../../../data/thumos14/proposals" + +if [[ ! -d "${PROP_DIR}" ]]; then + echo "${PROP_DIR} does not exist. Creating"; + mkdir -p ${PROP_DIR} +fi + +wget https://download.openmmlab.com/mmaction/dataset/thumos14/thumos14_tag_val_normalized_proposal_list.txt -P ${PROP_DIR} +wget https://download.openmmlab.com/mmaction/dataset/thumos14/thumos14_tag_test_normalized_proposal_list.txt -P ${PROP_DIR} diff --git a/tools/data/ucf101/README.md b/tools/data/ucf101/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cfe741f6a47085fab171ac9b11afb6875b7beb50 --- /dev/null +++ b/tools/data/ucf101/README.md @@ -0,0 +1,127 @@ +# Preparing UCF-101 + +## Introduction + + + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +For basic dataset information, you can refer to the dataset [website](https://www.crcv.ucf.edu/research/data-sets/ucf101/). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ucf101/`. + +## Step 1. Prepare Annotations + +First of all, you can run the following script to prepare annotations. + +```shell +bash download_annotations.sh +``` + +## Step 2. 
Prepare Videos + +Then, you can run the following script to prepare videos. + +```shell +bash download_videos.sh +``` + +For better decoding speed, you can resize the original videos into smaller sized, densely encoded version by: + +``` +python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/videos_256p_dense_cache --dense --level 2 --ext avi +``` + +## Step 3. Extract RGB and Flow + +This part is **optional** if you only want to use the video loader. + +Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. The extracted frames (RGB + Flow) will take up about 100GB. + +You can run the following script to soft link SSD. + +```shell +# execute these two line (Assume the SSD is mounted at "/mnt/SSD/") +mkdir /mnt/SSD/ucf101_extracted/ +ln -s /mnt/SSD/ucf101_extracted/ ../../../data/ucf101/rawframes +``` + +If you only want to play with RGB frames (since extracting optical flow can be time-consuming), consider running the following script to extract **RGB-only** frames using denseflow. + +```shell +bash extract_rgb_frames.sh +``` + +If you didn't install denseflow, you can still extract RGB frames using OpenCV by the following script, but it will keep the original size of the images. + +```shell +bash extract_rgb_frames_opencv.sh +``` + +If Optical Flow is also required, run the following script to extract flow using "tvl1" algorithm. + +```shell +bash extract_frames.sh +``` + +## Step 4. Generate File List + +you can run the follow script to generate file list in the format of rawframes and videos. + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +## Step 5. 
Check Directory Structure + +After the whole data process for UCF-101 preparation, +you will get the rawframes (RGB + Flow), videos and annotation files for UCF-101. + +In the context of the whole project (for UCF-101 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ucf101 +│ │ ├── ucf101_{train,val}_split_{1,2,3}_rawframes.txt +│ │ ├── ucf101_{train,val}_split_{1,2,3}_videos.txt +│ │ ├── annotations +│ │ ├── videos +│ │ │ ├── ApplyEyeMakeup +│ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi + +│ │ │ ├── YoYo +│ │ │ │ ├── v_YoYo_g25_c05.avi +│ │ ├── rawframes +│ │ │ ├── ApplyEyeMakeup +│ │ │ │ ├── v_ApplyEyeMakeup_g01_c01 +│ │ │ │ │ ├── img_00001.jpg +│ │ │ │ │ ├── img_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_x_00001.jpg +│ │ │ │ │ ├── flow_x_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_y_00001.jpg +│ │ │ │ │ ├── flow_y_00002.jpg +│ │ │ ├── ... +│ │ │ ├── YoYo +│ │ │ │ ├── v_YoYo_g01_c01 +│ │ │ │ ├── ... +│ │ │ │ ├── v_YoYo_g25_c05 + +``` + +For training and evaluating on UCF-101, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md). diff --git a/tools/data/ucf101/README_zh-CN.md b/tools/data/ucf101/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..eb9ed7bc6f0d9e5e9e63753b48c671aa13e869d8 --- /dev/null +++ b/tools/data/ucf101/README_zh-CN.md @@ -0,0 +1,125 @@ +# 准备 UCF-101 + +## 简介 + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +用户可参考该数据集的 [官网](https://www.crcv.ucf.edu/research/data-sets/ucf101/),以获取数据集相关的基本信息。 +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/ucf101/`。 + +## 步骤 1. 下载标注文件 + +首先,用户可运行以下脚本下载标注文件。 + +```shell +bash download_annotations.sh +``` + +## 步骤 2. 
准备视频文件 + +之后,用户可运行以下脚本准备视频文件。 + +```shell +bash download_videos.sh +``` + +用户可使用以下脚本,对原视频进行裁剪,得到密集编码且更小尺寸的视频。 + +``` +python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/videos_256p_dense_cache --dense --level 2 --ext avi +``` + +## 步骤 3. 抽取视频帧和光流 + +如果用户只想使用视频加载训练,则该部分是 **可选项**。 + +在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。 + +如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。所抽取的视频帧和光流约占据 100 GB 的存储空间。 + +可以运行以下命令为 SSD 建立软链接。 + +```shell +# 执行这两行进行抽取(假设 SSD 挂载在 "/mnt/SSD/") +mkdir /mnt/SSD/ucf101_extracted/ +ln -s /mnt/SSD/ucf101_extracted/ ../../../data/ucf101/rawframes +``` + +如果用户需要抽取 RGB 帧(因为抽取光流的过程十分耗时),可以考虑运行以下命令使用 denseflow **只抽取 RGB 帧**。 + +```shell +bash extract_rgb_frames.sh +``` + +如果用户没有安装 denseflow,则可以运行以下命令使用 OpenCV 抽取 RGB 帧。然而,该方法只能抽取与原始视频分辨率相同的帧。 + +```shell +bash extract_rgb_frames_opencv.sh +``` + +如果用户想抽取 RGB 帧和光流,则可以运行以下脚本使用 "tvl1" 算法进行抽取。 + +```shell +bash extract_frames.sh +``` + +## 步骤 4. 生成文件列表 + +用户可以通过运行以下命令生成帧和视频格式的文件列表。 + +```shell +bash generate_videos_filelist.sh +bash generate_rawframes_filelist.sh +``` + +## 步骤 5. 检查文件夹结构 + +在完成所有 UCF-101 数据集准备流程后, +用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。 + +在整个 MMAction2 文件夹下,UCF-101 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ucf101 +│ │ ├── ucf101_{train,val}_split_{1,2,3}_rawframes.txt +│ │ ├── ucf101_{train,val}_split_{1,2,3}_videos.txt +│ │ ├── annotations +│ │ ├── videos +│ │ │ ├── ApplyEyeMakeup +│ │ │ │ ├── v_ApplyEyeMakeup_g01_c01.avi + +│ │ │ ├── YoYo +│ │ │ │ ├── v_YoYo_g25_c05.avi +│ │ ├── rawframes +│ │ │ ├── ApplyEyeMakeup +│ │ │ │ ├── v_ApplyEyeMakeup_g01_c01 +│ │ │ │ │ ├── img_00001.jpg +│ │ │ │ │ ├── img_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_x_00001.jpg +│ │ │ │ │ ├── flow_x_00002.jpg +│ │ │ │ │ ├── ... +│ │ │ │ │ ├── flow_y_00001.jpg +│ │ │ │ │ ├── flow_y_00002.jpg +│ │ │ ├── ... +│ │ │ ├── YoYo +│ │ │ │ ├── v_YoYo_g01_c01 +│ │ │ │ ├── ... 
+│ │ │ │ ├── v_YoYo_g25_c05 + +``` + +关于对 UCF-101 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。 diff --git a/tools/data/ucf101/download_annotations.sh b/tools/data/ucf101/download_annotations.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3dc2a90db52a05d9ab92396bb93cd50de99c869 --- /dev/null +++ b/tools/data/ucf101/download_annotations.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/ucf101/annotations" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. Creating"; + mkdir -p ${DATA_DIR} +fi + +wget https://www.crcv.ucf.edu/wp-content/uploads/2019/03/UCF101TrainTestSplits-RecognitionTask.zip --no-check-certificate + +unzip -j UCF101TrainTestSplits-RecognitionTask.zip -d ${DATA_DIR}/ +rm UCF101TrainTestSplits-RecognitionTask.zip diff --git a/tools/data/ucf101/download_videos.sh b/tools/data/ucf101/download_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7d7600e1417a490c1cd8ec5c914116aba80ae32 --- /dev/null +++ b/tools/data/ucf101/download_videos.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/ucf101/" + +if [[ ! -d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} does not exist. 
Creating"; + mkdir -p ${DATA_DIR} +fi + +cd ${DATA_DIR} + +wget https://www.crcv.ucf.edu/datasets/human-actions/ucf101/UCF101.rar --no-check-certificate +unrar x UCF101.rar +mv ./UCF-101 ./videos + +cd "../../tools/data/ucf101" diff --git a/tools/data/ucf101/extract_frames.sh b/tools/data/ucf101/extract_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..44da782c4e6d4b59e321f43d1c109dda4b07bb0d --- /dev/null +++ b/tools/data/ucf101/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task both --level 2 --flow-type tvl1 +echo "Raw frames (RGB and Flow) Generated" +cd ucf101/ diff --git a/tools/data/ucf101/extract_rgb_frames.sh b/tools/data/ucf101/extract_rgb_frames.sh new file mode 100644 index 0000000000000000000000000000000000000000..71916e28e0574de1d66c436d456f4990b4c44254 --- /dev/null +++ b/tools/data/ucf101/extract_rgb_frames.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task rgb --level 2 --ext avi +echo "Genearte raw frames (RGB only)" + +cd ucf101/ diff --git a/tools/data/ucf101/extract_rgb_frames_opencv.sh b/tools/data/ucf101/extract_rgb_frames_opencv.sh new file mode 100644 index 0000000000000000000000000000000000000000..7cc3de219499900f0777ef3e3508aac3422cbcd7 --- /dev/null +++ b/tools/data/ucf101/extract_rgb_frames_opencv.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd ../ +python build_rawframes.py ../../data/ucf101/videos/ ../../data/ucf101/rawframes/ --task rgb --level 2 --ext avi --use-opencv +echo "Genearte raw frames (RGB only)" + +cd ucf101/ diff --git a/tools/data/ucf101/generate_rawframes_filelist.sh b/tools/data/ucf101/generate_rawframes_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b0bcd20b457b2655918d1a25ea378b014f467f1 --- /dev/null +++ b/tools/data/ucf101/generate_rawframes_filelist.sh @@ -0,0 +1,8 
@@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py ucf101 data/ucf101/rawframes/ --level 2 --format rawframes --shuffle +echo "Filelist for rawframes generated." + +cd tools/data/ucf101/ diff --git a/tools/data/ucf101/generate_videos_filelist.sh b/tools/data/ucf101/generate_videos_filelist.sh new file mode 100644 index 0000000000000000000000000000000000000000..ef72ca9e3f65c52c206a6fdbd70622401ca30ccb --- /dev/null +++ b/tools/data/ucf101/generate_videos_filelist.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +cd ../../../ + +PYTHONPATH=. python tools/data/build_file_list.py ucf101 data/ucf101/videos/ --level 2 --format videos --shuffle +echo "Filelist for videos generated." + +cd tools/data/ucf101/ diff --git a/tools/data/ucf101/label_map.txt b/tools/data/ucf101/label_map.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5c18b454e876330e19a5c8b09663dee4d8f9ad4 --- /dev/null +++ b/tools/data/ucf101/label_map.txt @@ -0,0 +1,101 @@ +ApplyEyeMakeup +ApplyLipstick +Archery +BabyCrawling +BalanceBeam +BandMarching +BaseballPitch +Basketball +BasketballDunk +BenchPress +Biking +Billiards +BlowDryHair +BlowingCandles +BodyWeightSquats +Bowling +BoxingPunchingBag +BoxingSpeedBag +BreastStroke +BrushingTeeth +CleanAndJerk +CliffDiving +CricketBowling +CricketShot +CuttingInKitchen +Diving +Drumming +Fencing +FieldHockeyPenalty +FloorGymnastics +FrisbeeCatch +FrontCrawl +GolfSwing +Haircut +Hammering +HammerThrow +HandstandPushups +HandstandWalking +HeadMassage +HighJump +HorseRace +HorseRiding +HulaHoop +IceDancing +JavelinThrow +JugglingBalls +JumpingJack +JumpRope +Kayaking +Knitting +LongJump +Lunges +MilitaryParade +Mixing +MoppingFloor +Nunchucks +ParallelBars +PizzaTossing +PlayingCello +PlayingDaf +PlayingDhol +PlayingFlute +PlayingGuitar +PlayingPiano +PlayingSitar +PlayingTabla +PlayingViolin +PoleVault +PommelHorse +PullUps +Punch +PushUps +Rafting +RockClimbingIndoor +RopeClimbing +Rowing +SalsaSpin 
+ShavingBeard +Shotput +SkateBoarding +Skiing +Skijet +SkyDiving +SoccerJuggling +SoccerPenalty +StillRings +SumoWrestling +Surfing +Swing +TableTennisShot +TaiChi +TennisSwing +ThrowDiscus +TrampolineJumping +Typing +UnevenBars +VolleyballSpiking +WalkingWithDog +WallPushups +WritingOnBoard +YoYo diff --git a/tools/data/ucf101_24/README.md b/tools/data/ucf101_24/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3418ce8d5bb0ca126c7490de98ce96b59424a6fc --- /dev/null +++ b/tools/data/ucf101_24/README.md @@ -0,0 +1,89 @@ +# Preparing UCF101-24 + +## Introduction + + + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +For basic dataset information, you can refer to the dataset [website](http://www.thumos.info/download.html). +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/ucf101_24/`. + +## Download and Extract + +You can download the RGB frames, optical flow and ground truth annotations from [google drive](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct). +The data are provided from [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md), which is adapted from [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector) and [corrected-UCF101-Annots](https://github.com/gurkirt/corrected-UCF101-Annots). + +:::{note} +The annotation of this UCF101-24 is from [here](https://github.com/gurkirt/corrected-UCF101-Annots), which is more correct. +::: + +After downloading the `UCF101_v2.tar.gz` file and put it in `$MMACTION2/tools/data/ucf101_24/`, you can run the following command to uncompress. 
+ +```shell +tar -zxvf UCF101_v2.tar.gz +``` + +## Check Directory Structure + +After uncompressing, you will get the `rgb-images` directory, `brox-images` directory and `UCF101v2-GT.pkl` for UCF101-24. + +In the context of the whole project (for UCF101-24 only), the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ucf101_24 +│ | ├── brox-images +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00140.jpg +│ | | | | ├── 00141.jpg +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── rgb-images +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00140.jpg +│ | | | | ├── 00141.jpg +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── UCF101v2-GT.pkl + +``` + +:::{note} +The `UCF101v2-GT.pkl` exists as a cache, it contains 6 items as follows: +::: + +1. `labels` (list): List of the 24 labels. +2. `gttubes` (dict): Dictionary that contains the ground truth tubes for each video. + A **gttube** is dictionary that associates with each index of label and a list of tubes. + A **tube** is a numpy array with `nframes` rows and 5 columns, each col is in format like ` `. +3. `nframes` (dict): Dictionary that contains the number of frames for each video, like `'HorseRiding/v_HorseRiding_g05_c02': 151`. +4. `train_videos` (list): A list with `nsplits=1` elements, each one containing the list of training videos. +5. `test_videos` (list): A list with `nsplits=1` elements, each one containing the list of testing videos. +6. `resolution` (dict): Dictionary that outputs a tuple (h,w) of the resolution for each video, like `'FloorGymnastics/v_FloorGymnastics_g09_c03': (240, 320)`. 
diff --git a/tools/data/ucf101_24/README_zh-CN.md b/tools/data/ucf101_24/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..8e6da66bfe5ab165e560aa9694f239ac5c3e6ab6 --- /dev/null +++ b/tools/data/ucf101_24/README_zh-CN.md @@ -0,0 +1,84 @@ +# 准备 UCF101-24 + +## 简介 + +```BibTeX +@article{Soomro2012UCF101AD, + title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + author={K. Soomro and A. Zamir and M. Shah}, + journal={ArXiv}, + year={2012}, + volume={abs/1212.0402} +} +``` + +用户可参考该数据集的 [官网](http://www.thumos.info/download.html),以获取数据集相关的基本信息。 +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/ucf101_24/`。 + +## 下载和解压 + +用户可以从 [这里](https://drive.google.com/drive/folders/1BvGywlAGrACEqRyfYbz3wzlVV3cDFkct) 下载 RGB 帧,光流和标注文件。 +该数据由 [MOC](https://github.com/MCG-NJU/MOC-Detector/blob/master/readme/Dataset.md) 代码库提供, +参考自 [act-detector](https://github.com/vkalogeiton/caffe/tree/act-detector) 和 [corrected-UCF101-Annots](https://github.com/gurkirt/corrected-UCF101-Annots)。 + +**注意**:UCF101-24 的标注文件来自于 [这里](https://github.com/gurkirt/corrected-UCF101-Annots),该标注文件相对于其他标注文件更加准确。 + +用户在下载 `UCF101_v2.tar.gz` 文件后,需将其放置在 `$MMACTION2/tools/data/ucf101_24/` 目录下,并使用以下指令进行解压: + +```shell +tar -zxvf UCF101_v2.tar.gz +``` + +## 检查文件夹结构 + +经过解压后,用户将得到 `rgb-images` 文件夹,`brox-images` 文件夹和 `UCF101v2-GT.pkl` 文件。 + +在整个 MMAction2 文件夹下,UCF101_24 的文件结构如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── ucf101_24 +│ | ├── brox-images +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... +│ | | | | ├── 00140.jpg +│ | | | | ├── 00141.jpg +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── rgb-images +│ | | ├── Basketball +│ | | | ├── v_Basketball_g01_c01 +│ | | | | ├── 00001.jpg +│ | | | | ├── 00002.jpg +│ | | | | ├── ... 
+│ | | | | ├── 00140.jpg +│ | | | | ├── 00141.jpg +│ | | ├── ... +│ | | ├── WalkingWithDog +│ | | | ├── v_WalkingWithDog_g01_c01 +│ | | | ├── ... +│ | | | ├── v_WalkingWithDog_g25_c04 +│ | ├── UCF101v2-GT.pkl + +``` + +**注意**:`UCF101v2-GT.pkl` 作为一个缓存文件,它包含 6 个项目: + +1. `labels` (list):24 个行为类别名称组成的列表 +2. `gttubes` (dict):每个视频对应的基准 tubes 组成的字典 + **gttube** 是由标签索引和 tube 列表组成的字典 + **tube** 是一个 `nframes` 行和 5 列的 numpy array,每一列的形式如 ` ` +3. `nframes` (dict):用以表示每个视频对应的帧数,如 `'HorseRiding/v_HorseRiding_g05_c02': 151` +4. `train_videos` (list):包含 `nsplits=1` 的元素,每一项都包含了训练视频的列表 +5. `test_videos` (list):包含 `nsplits=1` 的元素,每一项都包含了测试视频的列表 +6. `resolution` (dict):每个视频对应的分辨率(形如 (h,w)),如 `'FloorGymnastics/v_FloorGymnastics_g09_c03': (240, 320)` diff --git a/tools/data/video_retrieval/README.md b/tools/data/video_retrieval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..09c35491dcccc135a81a1d6e32a9da63efd11d68 --- /dev/null +++ b/tools/data/video_retrieval/README.md @@ -0,0 +1,83 @@ +# Preparing Video Retrieval Datasets + +## Introduction + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +```BibTeX +@inproceedings{chen2011collecting, + title={Collecting highly parallel data for paraphrase evaluation}, + author={Chen, David and Dolan, William B}, + booktitle={ACL}, + pages={190--200}, + year={2011} +} +``` + +Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/video_retrieval/`. + +## Preparing MSRVTT dataset + +For basic dataset information, you can refer to the MSRVTT dataset [website](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/). 
Run the following command to prepare the MSRVTT dataset: + +```shell +bash prepare_msrvtt.sh +``` + +After preparation, the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── video_retrieval +│ │ └── msrvtt +│ │ ├── train_9k.json +│ │ ├── train_7k.json +│ │ ├── test_JSFUSION.json +│ │ └─── videos +│ │ ├── video0.mp4 +│ │ ├── video1.mp4 +│ │ ├── ... +│ │ └── video9999.mp4 +``` + +## Preparing MSVD dataset + +For basic dataset information, you can refer to the MSVD dataset [website](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/). Run the following command to prepare the MSVD dataset: + +```shell +bash prepare_msvd.sh +``` + +After preparation, the folder structure will look like: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── video_retrieval +│ │ └── msvd +│ │ ├── train.json +│ │ ├── test.json +│ │ ├── val.json +│ │ └─── videos +│ │ ├── xxx.avi +│ │ ├── xxx.avi +│ │ ├── ... +│ │ └── xxx.avi +``` diff --git a/tools/data/video_retrieval/README_zh-CN.md b/tools/data/video_retrieval/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c19b84a2777ec22cca67bec001e29ecd30fd06c1 --- /dev/null +++ b/tools/data/video_retrieval/README_zh-CN.md @@ -0,0 +1,83 @@ +# 准备视频检索数据集 + +## 简介 + + + +```BibTeX +@inproceedings{xu2016msr, + title={Msr-vtt: A large video description dataset for bridging video and language}, + author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong}, + booktitle={CVPR}, + pages={5288--5296}, + year={2016} +} +``` + +```BibTeX +@inproceedings{chen2011collecting, + title={Collecting highly parallel data for paraphrase evaluation}, + author={Chen, David and Dolan, William B}, + booktitle={ACL}, + pages={190--200}, + year={2011} +} +``` + +在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/video_retrieval/`。 + +## 准备 MSRVTT 数据集 + 
+用户可参考该数据集的[官网](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/),以获取数据集相关的基本信息。运行下面的命令准备 MSRVTT 数据集: + +```shell +bash prepare_msrvtt.sh +``` + +完成上述准备步骤后,文件目录如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── video_retrieval +│ │ └── msrvtt +│ │ ├── train_9k.json +│ │ ├── train_7k.json +│ │ ├── test_JSFUSION.json +│ │ └─── videos +│ │ ├── video0.mp4 +│ │ ├── video1.mp4 +│ │ ├── ... +│ │ └── video9999.mp4 +``` + +## 准备 MSVD 数据集 + +用户可参考该数据集的[官网](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/),以获取数据集相关的基本信息。运行下面的命令准备 MSVD 数据集: + +```shell +bash prepare_msvd.sh +``` + +完成上述准备步骤后,文件目录如下: + +``` +mmaction2 +├── mmaction +├── tools +├── configs +├── data +│ ├── video_retrieval +│ │ └── msvd +│ │ ├── train.json +│ │ ├── test.json +│ │ ├── val.json +│ │ └─── videos +│ │ ├── xxx.avi +│ │ ├── xxx.avi +│ │ ├── ... +│ │ └── xxx.avi +``` diff --git a/tools/data/video_retrieval/prepare_msrvtt.py b/tools/data/video_retrieval/prepare_msrvtt.py new file mode 100644 index 0000000000000000000000000000000000000000..09fac7659d49c1b095fb06388df959dce56c8617 --- /dev/null +++ b/tools/data/video_retrieval/prepare_msrvtt.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp + +import pandas as pd + +DATA_DIR = '../../../data/video_retrieval/msrvtt' +SUFFIX = '.mp4' + +raw_data_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_data.json') +train_csv_path = [ + osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.9k.csv'), + osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.7k.csv') +] +test_csv_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_JSFUSION_test.csv') +train_json_path = [ + osp.join(DATA_DIR, 'train_9k.json'), + osp.join(DATA_DIR, 'train_7k.json') +] +test_json_path = osp.join(DATA_DIR, 'test_JSFUSION.json') + +with open(raw_data_path, 'r') as f: + data = json.load(f) + +sentences = data['sentences'] +video_dict = {} +for sentence in sentences: + caption = sentence['caption'] + video_id = sentence['video_id'] + if video_id not in video_dict: + video_dict[video_id] = [] + video_dict[video_id].append(caption) + +for ip, op in zip(train_csv_path, train_json_path): + train_csv = pd.read_csv(ip) + train_video_ids = list(train_csv['video_id'].values) + train_video_dict = {} + for video_id in train_video_ids: + train_video_dict[video_id + SUFFIX] = video_dict[video_id] + + with open(op, 'w') as f: + json.dump(train_video_dict, f) + +test_data = pd.read_csv(test_csv_path) + +test_video_dict = {} +for video_id, sentence in zip(test_data['video_id'], test_data['sentence']): + test_video_dict[video_id + SUFFIX] = [sentence] + +with open(test_json_path, 'w') as f: + json.dump(test_video_dict, f) diff --git a/tools/data/video_retrieval/prepare_msrvtt.sh b/tools/data/video_retrieval/prepare_msrvtt.sh new file mode 100644 index 0000000000000000000000000000000000000000..c0ee97d56ad3d28831b40fadf5bb8fbb652a6781 --- /dev/null +++ b/tools/data/video_retrieval/prepare_msrvtt.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/video_retrieval/msrvtt" +mkdir -p ${DATA_DIR} + + +if [ -f "msrvtt_data.zip" ]; then + echo "msrvtt_data.zip exists, skip downloading!" +else + echo "Downloading msrvtt_data.zip." 
+ wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msrvtt_data.zip +fi + +echo "Processing annotations started." +unzip -q msrvtt_data.zip -d ${DATA_DIR} +python prepare_msrvtt.py +echo "Processing annotations completed." + +if [ -f "MSRVTT.zip" ]; then + echo "MSRVTT.zip exists, skip downloading!" +else + echo "Downloading MSRVTT.zip." + wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip +fi + +echo "Processing videos started." +unzip -q MSRVTT.zip -d ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/MSRVTT/videos/all" -name "video*.mp4" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/MSRVTT" +rm -rf "${DATA_DIR}/msrvtt_data" +rm msrvtt_data.zip +rm MSRVTT.zip +echo "The preparation of the msrvtt dataset has been successfully completed." diff --git a/tools/data/video_retrieval/prepare_msvd.py b/tools/data/video_retrieval/prepare_msvd.py new file mode 100644 index 0000000000000000000000000000000000000000..e53813e4644cad2eb5d279660ca8964c975b91dc --- /dev/null +++ b/tools/data/video_retrieval/prepare_msvd.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +import pickle + +DATA_DIR = '../../../data/video_retrieval/msvd' +SUFFIX = '.avi' + +data_path = osp.join(DATA_DIR, 'msvd_data/raw-captions.pkl') +train_txt_path = osp.join(DATA_DIR, 'msvd_data/train_list.txt') +test_txt_path = osp.join(DATA_DIR, 'msvd_data/test_list.txt') +val_txt_path = osp.join(DATA_DIR, 'msvd_data/val_list.txt') +train_json_path = osp.join(DATA_DIR, 'train.json') +test_json_path = osp.join(DATA_DIR, 'test.json') +val_json_path = osp.join(DATA_DIR, 'val.json') + +with open(data_path, 'rb') as F: + data = pickle.load(F) + +video_dict = {} +for one_data in data: + caption = data[one_data] + if one_data not in video_dict: + video_dict[one_data] = [] + for cap in caption: + video_dict[one_data].append(' '.join(cap)) + +with open(train_txt_path, 'r') as f: + train_avi = f.readlines() + +train_avi_list = {} +for video in train_avi: + train_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] + +with open(train_json_path, 'w') as f: + json.dump(train_avi_list, f) + +with open(test_txt_path, 'r') as f: + test_avi = f.readlines() + +test_avi_list = {} +for video in test_avi: + test_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] +with open(test_json_path, 'w') as f: + json.dump(test_avi_list, f) + +with open(val_txt_path, 'r') as f: + val_avi = f.readlines() + +val_avi_list = {} +for video in val_avi: + val_avi_list[video.strip() + SUFFIX] = video_dict[video.strip()] + +with open(val_json_path, 'w') as f: + json.dump(val_avi_list, f) diff --git a/tools/data/video_retrieval/prepare_msvd.sh b/tools/data/video_retrieval/prepare_msvd.sh new file mode 100644 index 0000000000000000000000000000000000000000..365ac16fade8ea5dbf91560bbdd0d26f0a48ac8d --- /dev/null +++ b/tools/data/video_retrieval/prepare_msvd.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +DATA_DIR="../../../data/video_retrieval/msvd" +mkdir -p ${DATA_DIR} + + +if [ -f "msvd_data.zip" ]; then + echo "msvd_data.zip exists, skip downloading!" 
+else + echo "Downloading msvd_data.zip." + wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msvd_data.zip +fi + +echo "Processing annotations started." +unzip -q msvd_data.zip -d ${DATA_DIR} +python prepare_msvd.py +echo "Processing annotations completed." + +if [ -f "YouTubeClips.tar" ]; then + echo "YouTubeClips.tar exists, skip downloading!" +else + echo "Downloading YouTubeClips.tar." + wget https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar +fi + +echo "Processing videos started." +tar -xf YouTubeClips.tar -C ${DATA_DIR} +mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/YouTubeClips" -name "*.avi" -exec mv {} "${DATA_DIR}/videos/" \; +echo "Processing videos completed." + +rm -rf "${DATA_DIR}/YouTubeClips" +rm -rf "${DATA_DIR}/msvd_data" +rm msvd_data.zip +rm YouTubeClips.tar +echo "The preparation of the msvd dataset has been successfully completed." diff --git a/tools/deployment/export_onnx_gcn.py b/tools/deployment/export_onnx_gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..07c70e03ccbc96b6eb548929967db24fb78121ed --- /dev/null +++ b/tools/deployment/export_onnx_gcn.py @@ -0,0 +1,164 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting skeleton-based graph +# in MMAction2 to ONNX files. Please note that attempting to convert other +# models using this script may not yield successful results. 
+import argparse + +import numpy as np +import onnxruntime +import torch +import torch.nn as nn +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=150, help='number of input frames.') + parser.add_argument( + '--num_person', type=int, default=2, help='number of maximum person.') + parser.add_argument( + '--num_joints', + type=int, + default=0, + help='number of joints. If not given, will use default settings from' + 'the config file') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='stgcn.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class AvgPool2d(nn.Module): + + def forward(self, x): + return x.mean(dim=(-1, -2), keepdims=True) + + +class MaxPool2d(nn.Module): + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + return x + + +class GCNNet(nn.Module): + + def __init__(self, base_model): + super(GCNNet, self).__init__() + self.backbone = base_model.backbone + self.head = base_model.cls_head + + if hasattr(self.head, 'pool'): + pool = self.head.pool + if isinstance(pool, nn.AdaptiveAvgPool2d): + assert pool.output_size == 1 + self.head.pool = AvgPool2d() + elif isinstance(pool, nn.AdaptiveMaxPool2d): + assert pool.output_size == 1 + self.head.pool = MaxPool2d() + + def forward(self, input_tensor): + feat = self.backbone(input_tensor) + cls_score = self.head(feat) + return cls_score + + 
+def softmax(x): + x = np.exp(x - x.max()) + return x / x.sum() + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + init_default_scope(config.get('default_scope', 'mmaction')) + + if config.model.type != 'RecognizerGCN': + print( + 'This script serves the sole purpose of converting skeleton-based ' + 'graph in MMAction2 to ONNX files. Please note that attempting to ' + 'convert other models using this script may not yield successful ' + 'results.\n\n') + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + lookup = {'openpose': 18, 'nturgb+d': 25, 'coco': 17} + + num_joints = args.num_joints + num_person = args.num_person + num_frames = args.num_frames + if num_joints == 0: + layout = config.model.backbone.graph_cfg.layout + if layout not in lookup: + raise KeyError( + '`layout` not supported, please specify `num_joints`') + num_joints = lookup[layout] + + input_tensor = torch.randn(1, num_person, num_frames, num_joints, 3) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + + base_model.eval() + + data_sample = ActionDataSample() + data_sample.pred_scores = LabelData() + data_sample.pred_labels = LabelData() + base_output = base_model( + input_tensor.unsqueeze(0), data_samples=[data_sample], + mode='predict')[0] + base_output = base_output.pred_score.detach().cpu().numpy() + + model = GCNNet(base_model).to(args.device) + model.eval() + + torch.onnx.export( + model, (input_tensor), + args.output_file, + input_names=['input_tensor'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=12, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 1: 'num_person', + 2: 'num_frames' + }, + 'cls_score': { + 0: 'batch_size' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + 
input_feed = {'input_tensor': input_tensor.cpu().data.numpy()} + outputs = session.run(['cls_score'], input_feed=input_feed) + output = softmax(outputs[0][0]) + + diff = abs(base_output - output).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_posec3d.py b/tools/deployment/export_onnx_posec3d.py new file mode 100644 index 0000000000000000000000000000000000000000..c80c606e8036bfee55fb08417091d4080e7339aa --- /dev/null +++ b/tools/deployment/export_onnx_posec3d.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting PoseC3D skeleton models +# in MMAction2 to ONNX files. Please note that attempting to convert other +# models using this script may not yield successful results. +import argparse + +import numpy as np +import onnxruntime +import torch +import torch.nn as nn +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import LabelData + +from mmaction.registry import MODELS +from mmaction.structures import ActionDataSample + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=48, help='number of input frames.') + parser.add_argument( + '--image_size', type=int, default=64, help='size of the frame') + parser.add_argument( + '--num_joints', + type=int, + default=0, + help='number of joints. 
If not given, will use default settings from' + 'the config file') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='posec3d.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class AvgPool3d(nn.Module): + + def forward(self, x): + return x.mean(dim=(-1, -2, -3), keepdims=True) + + +class MaxPool3d(nn.Module): + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + x = x.max(dim=-3, keepdim=True)[0] + return x + + +class GCNNet(nn.Module): + + def __init__(self, base_model): + super(GCNNet, self).__init__() + self.backbone = base_model.backbone + self.head = base_model.cls_head + + if hasattr(self.head, 'pool'): + pool = self.head.pool + if isinstance(pool, nn.AdaptiveAvgPool3d): + assert pool.output_size == 1 + self.head.pool = AvgPool3d() + elif isinstance(pool, nn.AdaptiveMaxPool3d): + assert pool.output_size == 1 + self.head.pool = MaxPool3d() + + def forward(self, input_tensor): + feat = self.backbone(input_tensor) + cls_score = self.head(feat) + return cls_score + + +def softmax(x): + x = np.exp(x - x.max()) + return x / x.sum() + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + + if config.model.type != 'RecognizerGCN': + print('This script serves the sole purpose of converting PoseC3D ' + 'skeleton models in MMAction2 to ONNX files. 
Please note that ' + 'attempting to convert other models using this script may not ' + 'yield successful results.\n\n') + + init_default_scope(config.get('default_scope', 'mmaction')) + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + num_joints = args.num_joints + image_size = args.image_size + num_frames = args.num_frames + if num_joints == 0: + num_joints = config.model.backbone.in_channels + + input_tensor = torch.randn(1, num_joints, num_frames, image_size, + image_size) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + + base_model.eval() + + data_sample = ActionDataSample() + data_sample.pred_scores = LabelData() + data_sample.pred_labels = LabelData() + base_output = base_model( + input_tensor.unsqueeze(0), data_samples=[data_sample], + mode='predict')[0] + base_output = base_output.pred_score.detach().cpu().numpy() + + model = GCNNet(base_model).to(args.device) + model.eval() + + torch.onnx.export( + model, (input_tensor), + args.output_file, + input_names=['input_tensor'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=11, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 2: 'num_frames' + }, + 'cls_score': { + 0: 'batch_size' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + input_feed = {'input_tensor': input_tensor.cpu().data.numpy()} + outputs = session.run(['cls_score'], input_feed=input_feed) + output = softmax(outputs[0][0]) + + diff = abs(base_output - output).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py new file mode 100644 index 
0000000000000000000000000000000000000000..4f03a2ee5afdd39e990343d2ef2ab786ddaadd12 --- /dev/null +++ b/tools/deployment/export_onnx_stdet.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This script serves the sole purpose of converting spatial-temporal detection +# models supported in MMAction2 to ONNX files. Please note that attempting to +# convert other models using this script may not yield successful results. +import argparse + +import onnxruntime +import torch +import torch.nn as nn +from mmdet.structures.bbox import bbox2roi +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +from mmaction.registry import MODELS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get model flops and params') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--num_frames', type=int, default=8, help='number of input frames.') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[256, 455], + help='input image size') + parser.add_argument( + '--device', type=str, default='cpu', help='CPU/CUDA device option') + parser.add_argument( + '--output_file', + type=str, + default='stdet.onnx', + help='file name of the output onnx file') + args = parser.parse_args() + return args + + +class SpatialMaxPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x.max(dim=-1, keepdim=True)[0] + return x.max(dim=-2, keepdim=True)[0] + + +class SpatialAvgPool(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x.mean(dim=(-1, -2), keepdims=True) + + +class TemporalMaxPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x.max(dim=-3, keepdim=True)[0] + + +class TemporalAvgPool3d(nn.Module): + + def __init__(self): + super().__init__() + + def 
forward(self, x): + return x.mean(dim=-3, keepdim=True) + + +class GlobalPool2d(nn.Module): + + def __init__(self, pool_size, output_size, later_max=True): + super().__init__() + self.pool = nn.AvgPool2d(pool_size) + self.max = later_max + self.output_size = output_size + + def forward(self, x): + x = self.pool(x) + if self.max: + x = x.max(dim=-1, keepdim=True)[0] + x = x.max(dim=-2, keepdim=True)[0] + else: + x = x.mean(dim=(-1, -2), keepdims=True) + x = x.expand(-1, -1, self.output_size, self.output_size) + return x + + +class STDet(nn.Module): + + def __init__(self, base_model, input_tensor): + super(STDet, self).__init__() + self.backbone = base_model.backbone + self.bbox_roi_extractor = base_model.roi_head.bbox_roi_extractor + self.bbox_head = base_model.roi_head.bbox_head + + output_size = self.bbox_roi_extractor.global_pool.output_size + pool_size = min(input_tensor.shape[-2:]) // 16 // output_size + + if isinstance(self.bbox_head.temporal_pool, nn.AdaptiveAvgPool3d): + self.bbox_head.temporal_pool = TemporalAvgPool3d() + else: + self.bbox_head.temporal_pool = TemporalMaxPool3d() + if isinstance(self.bbox_head.spatial_pool, nn.AdaptiveAvgPool3d): + self.bbox_head.spatial_pool = SpatialAvgPool() + self.bbox_roi_extractor.global_pool = GlobalPool2d( + pool_size, output_size, later_max=False) + else: + self.bbox_head.spatial_pool = SpatialMaxPool3d() + self.bbox_roi_extractor.global_pool = GlobalPool2d( + pool_size, output_size, later_max=True) + + def forward(self, input_tensor, rois): + feat = self.backbone(input_tensor) + bbox_feats, _ = self.bbox_roi_extractor(feat, rois) + cls_score = self.bbox_head(bbox_feats) + return cls_score + + +def main(): + args = parse_args() + config = Config.fromfile(args.config) + + if config.model.type != 'FastRCNN': + print('This script serves the sole purpose of converting spatial ' + 'temporal detection models in MMAction2 to ONNX files. 
Please ' + 'note that attempting to convert other models using this script ' + 'may not yield successful results.\n\n') + + init_default_scope(config.get('default_scope', 'mmaction')) + + base_model = MODELS.build(config.model) + load_checkpoint(base_model, args.checkpoint, map_location='cpu') + base_model.to(args.device) + + if len(args.shape) == 1: + input_shape = (args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = tuple(args.shape) + else: + raise ValueError('invalid input shape') + + input_tensor = torch.randn(1, 3, args.num_frames, *input_shape) + input_tensor = input_tensor.clamp(-3, 3).to(args.device) + proposal = torch.Tensor([[22., 59., 67., 157.], [186., 73., 217., 159.], + [407., 95., 431., 168.]]) + + rois = bbox2roi([proposal]).to(args.device) + + model = STDet(base_model, input_tensor).to(args.device) + model.eval() + cls_score = model(input_tensor, rois) + print(f'Model output shape: {cls_score.shape}') + + torch.onnx.export( + model, (input_tensor, rois), + args.output_file, + input_names=['input_tensor', 'rois'], + output_names=['cls_score'], + export_params=True, + do_constant_folding=True, + verbose=False, + opset_version=11, + dynamic_axes={ + 'input_tensor': { + 0: 'batch_size', + 3: 'height', + 4: 'width' + }, + 'rois': { + 0: 'total_num_bbox_for_the_batch' + }, + 'cls_score': { + 0: 'total_num_bbox_for_the_batch' + } + }) + + print(f'Successfully export the onnx file to {args.output_file}') + + # Test exported file + session = onnxruntime.InferenceSession(args.output_file) + input_feed = { + 'input_tensor': input_tensor.cpu().data.numpy(), + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + outputs = outputs[0] + diff = abs(cls_score.cpu().data.numpy() - outputs).max() + if diff < 1e-5: + print('The output difference is smaller than 1e-5.') + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/mmaction2torchserve.py 
b/tools/deployment/mmaction2torchserve.py new file mode 100644 index 0000000000000000000000000000000000000000..32ef31ffb2dad38270e19ca45ae6028155cfb8bd --- /dev/null +++ b/tools/deployment/mmaction2torchserve.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import shutil +from argparse import ArgumentParser, Namespace +from pathlib import Path +from tempfile import TemporaryDirectory + +from mmengine.config import Config +from mmengine.utils import mkdir_or_exist + +try: + from model_archiver.model_packaging import package_model + from model_archiver.model_packaging_utils import ModelExportUtils +except ImportError: + raise ImportError('`torch-model-archiver` is required.' + 'Try: pip install torch-model-archiver') + + +def mmaction2torchserve( + config_file: str, + checkpoint_file: str, + output_folder: str, + model_name: str, + label_file: str, + model_version: str = '1.0', + force: bool = False, +): + """Converts MMAction2 model (config + checkpoint) to TorchServe `.mar`. + + Args: + config_file (str): In MMAction2 config format. + checkpoint_file (str): In MMAction2 checkpoint format. + output_folder (str): Folder where `{model_name}.mar` will be created. + The file created will be in TorchServe archive format. + label_file (str): A txt file which contains the action category names. + model_name (str | None): If not None, used for naming the + `{model_name}.mar` file that will be created under `output_folder`. + If None, `{Path(checkpoint_file).stem}` will be used. + model_version (str): Model's version. + force (bool): If True, if there is an existing `{model_name}.mar` file + under `output_folder` it will be overwritten. 
+ """ + mkdir_or_exist(output_folder) + + config = Config.fromfile(config_file) + + with TemporaryDirectory() as tmpdir: + config.dump(f'{tmpdir}/config.py') + shutil.copy(label_file, f'{tmpdir}/label_map.txt') + + args = Namespace( + **{ + 'model_file': f'{tmpdir}/config.py', + 'serialized_file': checkpoint_file, + 'handler': f'{Path(__file__).parent}/mmaction_handler.py', + 'model_name': model_name or Path(checkpoint_file).stem, + 'version': model_version, + 'export_path': output_folder, + 'force': force, + 'requirements_file': None, + 'extra_files': f'{tmpdir}/label_map.txt', + 'runtime': 'python', + 'archive_format': 'default' + }) + manifest = ModelExportUtils.generate_manifest_json(args) + package_model(args, manifest) + + +def parse_args(): + parser = ArgumentParser( + description='Convert MMAction2 models to TorchServe `.mar` format.') + parser.add_argument('config', type=str, help='config file path') + parser.add_argument('checkpoint', type=str, help='checkpoint file path') + parser.add_argument( + '--output-folder', + type=str, + required=True, + help='Folder where `{model_name}.mar` will be created.') + parser.add_argument( + '--model-name', + type=str, + default=None, + help='If not None, used for naming the `{model_name}.mar`' + 'file that will be created under `output_folder`.' + 'If None, `{Path(checkpoint_file).stem}` will be used.') + parser.add_argument( + '--label-file', + type=str, + default=None, + help='A txt file which contains the action category names. 
') + parser.add_argument( + '--model-version', + type=str, + default='1.0', + help='Number used for versioning.') + parser.add_argument( + '-f', + '--force', + action='store_true', + help='overwrite the existing `{model_name}.mar`') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + + mmaction2torchserve(args.config, args.checkpoint, args.output_folder, + args.model_name, args.label_file, args.model_version, + args.force) diff --git a/tools/deployment/mmaction_handler.py b/tools/deployment/mmaction_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fdd29df02c1c75494b35648a1e7379a1a2a038a --- /dev/null +++ b/tools/deployment/mmaction_handler.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import base64 +import os +import os.path as osp +import warnings + +import decord +import numpy as np +import torch + +from mmaction.apis import inference_recognizer, init_recognizer # noqa: F401 + +try: + from ts.torch_handler.base_handler import BaseHandler +except ImportError: + raise ImportError('`ts` is required. Try: pip install ts.') + + +class MMActionHandler(BaseHandler): + + def initialize(self, context): + properties = context.system_properties + self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu' + self.device = torch.device(self.map_location + ':' + + str(properties.get('gpu_id')) if torch.cuda. + is_available() else self.map_location) + self.manifest = context.manifest + + model_dir = properties.get('model_dir') + serialized_file = self.manifest['model']['serializedFile'] + checkpoint = os.path.join(model_dir, serialized_file) + self.config_file = os.path.join(model_dir, 'config.py') + + mapping_file_path = osp.join(model_dir, 'label_map.txt') + if not os.path.isfile(mapping_file_path): + warnings.warn('Missing the label_map.txt file. 
' + 'Inference output will not include class name.') + self.mapping = None + else: + lines = open(mapping_file_path).readlines() + self.mapping = [x.strip() for x in lines] + + self.model = init_recognizer(self.config_file, checkpoint, self.device) + self.initialized = True + + def preprocess(self, data): + videos = [] + + for row in data: + video = row.get('data') or row.get('body') + if isinstance(video, str): + video = base64.b64decode(video) + # First save the bytes as a tmp file + with open('/tmp/tmp.mp4', 'wb') as fout: + fout.write(video) + + video = decord.VideoReader('/tmp/tmp.mp4') + frames = [x.asnumpy() for x in video] + videos.append(np.stack(frames)) + + return videos + + def inference(self, data, *args, **kwargs): + results = [inference_recognizer(self.model, item) for item in data] + return results + + def postprocess(self, data): + # Format output following the example ObjectDetectionHandler format + output = [] + for video_idx, video_result in enumerate(data): + output.append([]) + assert isinstance(video_result, list) + + output[video_idx] = { + self.mapping[x[0]] if self.mapping else x[0]: float(x[1]) + for x in video_result + } + + return output diff --git a/tools/deployment/publish_model.py b/tools/deployment/publish_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce1a78e3a805250a3c36343245e1c2304fd9413 --- /dev/null +++ b/tools/deployment/publish_model.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import datetime +import os +import platform +import subprocess + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove some unnecessary keys for smaller file size + unnecessary_keys = ['message_hub', 'optimizer', 'param_schedulers'] + for k in unnecessary_keys: + if k in checkpoint: + del checkpoint[k] + unnecessary_params = ['data_preprocessor.mean', 'data_preprocessor.std'] + for k in unnecessary_params: + if 'state_dict' in checkpoint and k in checkpoint['state_dict']: + del checkpoint['state_dict'][k] + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. + torch.save(checkpoint, out_file) + if platform.system() == 'Windows': + sha = subprocess.check_output( + ['certutil', '-hashfile', out_file, 'SHA256']) + sha = str(sha).split('\\r\\n')[1] + else: + sha = subprocess.check_output(['sha256sum', out_file]).decode() + if out_file.endswith('.pth'): + out_file_name = out_file[:-4] + else: + out_file_name = out_file + + current_date = datetime.datetime.now().strftime('%Y%m%d') + final_file = out_file_name + f'_{current_date}-{sha[:8]}.pth' + os.rename(out_file, final_file) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/tools/dist_test.sh b/tools/dist_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..958b6691a026a4c30050dd41ec4a76576e9450f4 --- /dev/null +++ b/tools/dist_test.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +set -x + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} 
#!/usr/bin/env bash

set -x

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# Arguments starting from the fourth one are captured by ${@:4}.
# NOTE: the comment must not sit between the backslash-continued lines
# below; a comment terminates the continuation, so PYTHONPATH would be set
# for the comment instead of the python command and the launch would run
# without it.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS --master_port=$PORT $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
+ """ + video_infos = [] + anno_database = mmengine.load(ann_file) + for video_name in anno_database: + video_info = anno_database[video_name] + video_info['video_name'] = video_name + video_infos.append(video_info) + return video_infos + + +def generate_proposals(ann_file, tem_results_dir, pgm_proposals_dir, + pgm_proposals_thread, **kwargs): + """Generate proposals using multi-process. + + Args: + ann_file (str): A json file path of the annotation file for + all videos to be processed. + tem_results_dir (str): Directory to read tem results + pgm_proposals_dir (str): Directory to save generated proposals. + pgm_proposals_thread (int): Total number of threads. + kwargs (dict): Keyword arguments for "generate_candidate_proposals". + """ + video_infos = load_video_infos(ann_file) + num_videos = len(video_infos) + num_videos_per_thread = num_videos // pgm_proposals_thread + processes = [] + manager = mp.Manager() + result_dict = manager.dict() + kwargs['result_dict'] = result_dict + for tid in range(pgm_proposals_thread - 1): + tmp_video_list = range(tid * num_videos_per_thread, + (tid + 1) * num_videos_per_thread) + p = mp.Process( + target=generate_candidate_proposals, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + + tmp_video_list = range((pgm_proposals_thread - 1) * num_videos_per_thread, + num_videos) + p = mp.Process( + target=generate_candidate_proposals, + args=( + tmp_video_list, + video_infos, + tem_results_dir, + ), + kwargs=kwargs) + p.start() + processes.append(p) + + for p in processes: + p.join() + + # save results + os.makedirs(pgm_proposals_dir, exist_ok=True) + prog_bar = mmengine.ProgressBar(num_videos) + header = 'tmin,tmax,tmin_score,tmax_score,score,match_iou,match_ioa' + for video_name in result_dict: + proposals = result_dict[video_name] + proposal_path = osp.join(pgm_proposals_dir, video_name + '.csv') + np.savetxt( + proposal_path, + proposals, + header=header, + 
def generate_features(ann_file, tem_results_dir, pgm_proposals_dir,
                      pgm_features_dir, pgm_features_thread, **kwargs):
    """Generate proposal features for all videos with multiple processes.

    Args:
        ann_file (str): A json file path of the annotation file covering
            every video to be processed.
        tem_results_dir (str): Directory to read tem results.
        pgm_proposals_dir (str): Directory to read generated proposals.
        pgm_features_dir (str): Directory to save generated features.
        pgm_features_thread (int): Total number of worker processes.
        kwargs (dict): Keyword arguments for "generate_bsp_feature".
    """
    video_infos = load_video_infos(ann_file)
    num_videos = len(video_infos)
    chunk = num_videos // pgm_features_thread

    manager = mp.Manager()
    feature_return_dict = manager.dict()
    kwargs['result_dict'] = feature_return_dict

    # One contiguous index range per worker; the final worker also takes
    # the remainder videos that do not divide evenly.
    starts = [tid * chunk for tid in range(pgm_features_thread)]
    stops = starts[1:] + [num_videos]
    workers = []
    for start, stop in zip(starts, stops):
        worker = mp.Process(
            target=generate_bsp_feature,
            args=(range(start, stop), video_infos, tem_results_dir,
                  pgm_proposals_dir),
            kwargs=kwargs)
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    # Persist every video's feature as one npy file.
    os.makedirs(pgm_features_dir, exist_ok=True)
    prog_bar = mmengine.ProgressBar(num_videos)
    for video_name in feature_return_dict.keys():
        np.save(
            osp.join(pgm_features_dir, f'{video_name}.npy'),
            feature_return_dict[video_name])
        prog_bar.update()
def main():
    """Run the BSN proposal generation module for the selected split."""
    print('Begin Proposal Generation Module')
    args = parse_args()
    cfg = mmengine.Config.fromfile(args.config)

    # 'test' works on the validation annotations, 'train' on the training
    # ones; the proposal config is shared, the feature config is per-split.
    if args.mode == 'test':
        ann_file = cfg.ann_file_val
        features_cfg = cfg.pgm_features_test_cfg
    else:
        ann_file = cfg.ann_file_train
        features_cfg = cfg.pgm_features_train_cfg

    generate_proposals(ann_file, cfg.tem_results_dir, cfg.pgm_proposals_dir,
                       **cfg.pgm_proposals_cfg)
    print('\nFinish proposal generation')

    generate_features(ann_file, cfg.tem_results_dir, cfg.pgm_proposals_dir,
                      cfg.pgm_features_dir, **features_cfg)
    print('\nFinish feature generation')

    print('Finish Proposal Generation Module')
def parse_args():
    """Parse command-line arguments for clip feature extraction.

    Also propagates ``--local_rank`` into the ``LOCAL_RANK`` environment
    variable for distributed launchers that expect it.
    """
    parser = argparse.ArgumentParser(
        description='MMAction2 feature extraction')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('output_prefix', type=str, help='output prefix')
    parser.add_argument(
        '--video-list', type=str, default=None, help='video file list')
    parser.add_argument(
        '--video-root', type=str, default=None, help='video root directory')
    parser.add_argument(
        '--spatial-type',
        type=str,
        default='avg',
        choices=['avg', 'max', 'keep'],
        help='Pooling type in spatial dimension')
    parser.add_argument(
        '--temporal-type',
        type=str,
        default='avg',
        choices=['avg', 'max', 'keep'],
        help='Pooling type in temporal dimension')
    parser.add_argument(
        '--long-video-mode',
        action='store_true',
        help='Perform long video inference to get a feature list from a '
        'video')
    # help texts below fixed: they previously duplicated themselves
    # ("Clip interval for Clip interval ...", "long video long video ...")
    parser.add_argument(
        '--clip-interval',
        type=int,
        default=None,
        help='Clip interval of adjacent center of sampled clips, used for '
        'long video inference')
    parser.add_argument(
        '--frame-interval',
        type=int,
        default=None,
        help='Temporal interval of adjacent sampled frames, used for long '
        'video inference')
    parser.add_argument(
        '--multi-view',
        action='store_true',
        help='Perform multi view inference')
    parser.add_argument(
        '--dump-score',
        action='store_true',
        help='Dump predict scores rather than features')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + test_pipeline = cfg.test_dataloader.dataset.pipeline + # -------------------- Feature Head -------------------- + if not args.dump_score: + backbone_type2name = dict( + ResNet3dSlowFast='slowfast', + MobileNetV2TSM='tsm', + ResNetTSM='tsm', + ) + + if cfg.model.type == 'RecognizerGCN': + backbone_name = 'gcn' + else: + backbone_name = backbone_type2name.get(cfg.model.backbone.type) + num_segments = None + if backbone_name == 'tsm': + for idx, transform in enumerate(test_pipeline): + if transform.type == 'UntrimmedSampleFrames': + clip_len = transform['clip_len'] + continue + elif transform.type == 'SampleFrames': + clip_len = transform['num_clips'] + num_segments = cfg.model.backbone.get('num_segments', 8) + assert num_segments == clip_len, \ + f'num_segments and clip length must same for TSM, but got ' \ + f'num_segments {num_segments} clip_len {clip_len}' + if cfg.model.test_cfg is not None: + max_testing_views = cfg.model.test_cfg.get( + 'max_testing_views', num_segments) + assert max_testing_views % num_segments == 0, \ + 'tsm needs to infer with batchsize of multiple ' \ + 'of num_segments.' 
+ + spatial_type = None if args.spatial_type == 'keep' else \ + args.spatial_type + temporal_type = None if args.temporal_type == 'keep' else \ + args.temporal_type + feature_head = dict( + type='FeatureHead', + spatial_type=spatial_type, + temporal_type=temporal_type, + backbone_name=backbone_name, + num_segments=num_segments) + cfg.model.cls_head = feature_head + + # ---------------------- multiple view ---------------------- + if not args.multi_view: + # average features among multiple views + cfg.model.cls_head['average_clips'] = 'score' + if cfg.model.type == 'Recognizer3D': + for idx, transform in enumerate(test_pipeline): + if transform.type == 'SampleFrames': + test_pipeline[idx]['num_clips'] = 1 + for idx, transform in enumerate(test_pipeline): + if transform.type == 'SampleFrames': + test_pipeline[idx]['twice_sample'] = False + # if transform.type in ['ThreeCrop', 'TenCrop']: + if transform.type == 'TenCrop': + test_pipeline[idx].type = 'CenterCrop' + + # -------------------- pipeline settings -------------------- + # assign video list and video root + if args.video_list is not None: + cfg.test_dataloader.dataset.ann_file = args.video_list + if args.video_root is not None: + if cfg.test_dataloader.dataset.type == 'VideoDataset': + cfg.test_dataloader.dataset.data_prefix = dict( + video=args.video_root) + elif cfg.test_dataloader.dataset.type == 'RawframeDataset': + cfg.test_dataloader.dataset.data_prefix = dict(img=args.video_root) + args.video_list = cfg.test_dataloader.dataset.ann_file + args.video_root = cfg.test_dataloader.dataset.data_prefix + # use UntrimmedSampleFrames for long video inference + if args.long_video_mode: + # preserve features of multiple clips + cfg.model.cls_head['average_clips'] = None + cfg.test_dataloader.batch_size = 1 + is_recognizer2d = (cfg.model.type == 'Recognizer2D') + + frame_interval = args.frame_interval + for idx, transform in enumerate(test_pipeline): + if transform.type == 'UntrimmedSampleFrames': + clip_len = 
transform['clip_len'] + continue + # replace SampleFrame by UntrimmedSampleFrames + elif transform.type in ['SampleFrames', 'UniformSample']: + assert args.clip_interval is not None, \ + 'please specify clip interval for long video inference' + if is_recognizer2d: + # clip_len of UntrimmedSampleFrames is same as + # num_clips for 2D Recognizer. + clip_len = transform['num_clips'] + else: + clip_len = transform['clip_len'] + if frame_interval is None: + # take frame_interval of SampleFrames as default + frame_interval = transform.get('frame_interval') + assert frame_interval is not None, \ + 'please specify frame interval for long video ' \ + 'inference when use UniformSample or 2D Recognizer' + + sample_cfgs = dict( + type='UntrimmedSampleFrames', + clip_len=clip_len, + clip_interval=args.clip_interval, + frame_interval=frame_interval) + test_pipeline[idx] = sample_cfgs + continue + # flow input will stack all frames + if cfg.test_dataloader.dataset.get('modality') == 'Flow': + clip_len = 1 + + if is_recognizer2d: + from mmaction.models import ActionDataPreprocessor + from mmaction.registry import MODELS + + @MODELS.register_module() + class LongVideoDataPreprocessor(ActionDataPreprocessor): + """DataPreprocessor for 2D recognizer to infer on long video. 
def split_feats(args):
    """Split the aggregated dump into one pickle file per video.

    Reads the combined results written during testing, pairs them with the
    video names listed in ``args.video_list``, writes one ``<name>.pkl``
    under ``args.output_prefix`` for each video, and finally removes the
    combined dump file.
    """
    all_results = load(args.dump)
    if args.dump_score:
        # keep only the prediction scores of every sample
        all_results = [item['pred_scores']['item'] for item in all_results]

    # first whitespace-separated token of each list line is the video name
    names = [line.split(' ')[0] for line in list_from_file(args.video_list)]

    for name, feat in zip(names, all_results):
        dump(feat, osp.join(args.output_prefix, name + '.pkl'))
    os.remove(args.dump)
#!/usr/bin/env bash

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# Arguments starting from the fourth one are captured by ${@:4}.
# NOTE: the comment must not sit between the backslash-continued lines
# below; a comment terminates the continuation, so PYTHONPATH would be set
# for the comment instead of the python command and the launch would run
# without it.
PYTHONPATH="$(dirname $0)/../..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/clip_feature_extraction.py $CONFIG $CHECKPOINT \
    --launcher pytorch ${@:4}
+ """ + assert method in ['tvl1', 'farneback'] + gray_frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames] + + if method == 'tvl1': + tvl1 = cv2.optflow.DualTVL1OpticalFlow_create() + + def op(x, y): + return tvl1.calc(x, y, None) + elif method == 'farneback': + + def op(x, y): + return cv2.calcOpticalFlowFarneback(x, y, None, 0.5, 3, 15, 3, 5, + 1.2, 0) + + gray_st = gray_frames[:-1] + gray_ed = gray_frames[1:] + + flow = [op(x, y) for x, y in zip(gray_st, gray_ed)] + return flow + + +def extract_dense_flow(path, + dest, + bound=20., + save_rgb=False, + start_idx=0, + rgb_tmpl='img_{:05d}.jpg', + flow_tmpl='{}_{:05d}.jpg', + method='tvl1'): + """Extract dense flow given video or frames, save them as gray-scale + images. + + Args: + path (str): Location of the input video. + dest (str): The directory to store the extracted flow images. + bound (float): Bound for the flow-to-image normalization. Default: 20. + save_rgb (bool): Save extracted RGB frames. Default: False. + start_idx (int): The starting frame index if use frames as input, the + first image is path.format(start_idx). Default: 0. + rgb_tmpl (str): The template of RGB frame names, Default: + 'img_{:05d}.jpg'. + flow_tmpl (str): The template of Flow frame names, Default: + '{}_{:05d}.jpg'. + method (str): Use which method to generate flow. Options are 'tvl1' + and 'farneback'. Default: 'tvl1'. 
+ """ + + frames = [] + assert osp.exists(path) + video = cv2.VideoCapture(path) + flag, f = video.read() + while flag: + frames.append(f) + flag, f = video.read() + + flow = generate_flow(frames, method=method) + + flow_x = [flow_to_img(x[:, :, 0], bound) for x in flow] + flow_y = [flow_to_img(x[:, :, 1], bound) for x in flow] + + if not osp.exists(dest): + os.system('mkdir -p ' + dest) + flow_x_names = [ + osp.join(dest, flow_tmpl.format('x', ind + start_idx)) + for ind in range(len(flow_x)) + ] + flow_y_names = [ + osp.join(dest, flow_tmpl.format('y', ind + start_idx)) + for ind in range(len(flow_y)) + ] + + num_frames = len(flow) + for i in range(num_frames): + cv2.imwrite(flow_x_names[i], flow_x[i]) + cv2.imwrite(flow_y_names[i], flow_y[i]) + + if save_rgb: + img_names = [ + osp.join(dest, rgb_tmpl.format(ind + start_idx)) + for ind in range(len(frames)) + ] + for frame, name in zip(frames, img_names): + cv2.imwrite(name, frame) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract flow and RGB images') + parser.add_argument( + '--input', + help='videos for frame extraction, can be' + 'single video or a video list, the video list should be a txt file ' + 'and just consists of filenames without directories') + parser.add_argument( + '--prefix', + default='', + help='the prefix of input ' + 'videos, used when input is a video list') + parser.add_argument( + '--dest', + default='', + help='the destination to save ' + 'extracted frames') + parser.add_argument( + '--save-rgb', action='store_true', help='also save ' + 'rgb frames') + parser.add_argument( + '--rgb-tmpl', + default='img_{:05d}.jpg', + help='template filename of rgb frames') + parser.add_argument( + '--flow-tmpl', + default='{}_{:05d}.jpg', + help='template filename of flow frames') + parser.add_argument( + '--start-idx', + type=int, + default=1, + help='the start ' + 'index of extracted frames') + parser.add_argument( + '--method', + default='tvl1', + help='use which method to 
if __name__ == '__main__':
    args = parse_args()
    if args.input.endswith('.txt'):
        # The input is a list file: one video filename per line, resolved
        # against --prefix. `with` closes the handle deterministically
        # (the previous bare `open(...).readlines()` leaked it).
        with open(args.input) as f:
            lines = [x.strip() for x in f]
        videos = [osp.join(args.prefix, x) for x in lines]
        dests = [osp.join(args.dest, x.split('.')[0]) for x in lines]
        tasks = list(zip(videos, dests))
    else:
        # single-video mode
        tasks = [(args.input, args.dest)]

    for video, dest in tasks:
        extract_dense_flow(video, dest, args.bound, args.save_rgb,
                           args.start_idx, args.rgb_tmpl, args.flow_tmpl,
                           args.method)
+CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} +PY_ARGS=${@:4} # Any arguments from the forth one are captured by this + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..84f9d71f7611f73c869919cb0d06b207f64d0f05 --- /dev/null +++ b/tools/test.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner + +from mmaction.registry import RUNNERS + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--dump', + type=str, + help='dump predictions to a pickle file for offline evaluation') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--show-dir', + help='directory where the visualization images will be saved.') + parser.add_argument( + '--show', + action='store_true', + help='whether to display the prediction results in a window.') + parser.add_argument( + '--interval', + type=int, + default=1, + help='visualize per interval samples.') + parser.add_argument( + '--wait-time', + type=float, + default=2, + help='display time of every window. (second)') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + # -------------------- visualization -------------------- + if args.show or (args.show_dir is not None): + assert 'visualization' in cfg.default_hooks, \ + 'VisualizationHook is not set in the `default_hooks` field of ' \ + 'config. Please set `visualization=dict(type="VisualizationHook")`' + + cfg.default_hooks.visualization.enable = True + cfg.default_hooks.visualization.show = args.show + cfg.default_hooks.visualization.wait_time = args.wait_time + cfg.default_hooks.visualization.out_dir = args.show_dir + cfg.default_hooks.visualization.interval = args.interval + + # -------------------- Dump predictions -------------------- + if args.dump is not None: + assert args.dump.endswith(('.pkl', '.pickle')), \ + 'The dump file must be a pkl file.' 
def main():
    """Entry point: build a runner from the config and run testing."""
    args = parse_args()

    # Load the config and fold in the CLI options.
    cfg = Config.fromfile(args.config)
    cfg = merge_args(cfg, args)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir priority: CLI > value in config file > derived from filename.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        default_dir = osp.splitext(osp.basename(args.config))[0]
        cfg.work_dir = osp.join('./work_dirs', default_dir)

    cfg.load_from = args.checkpoint

    # A custom 'runner_type' in the config selects a registered runner;
    # otherwise fall back to the default mmengine Runner.
    if 'runner_type' in cfg:
        runner = RUNNERS.build(cfg)
    else:
        runner = Runner.from_cfg(cfg)

    # start testing
    runner.test()
+import argparse +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner + +from mmaction.registry import RUNNERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a action recognizer') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--amp', + action='store_true', + help='enable automatic-mixed-precision training') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='whether to auto scale the learning rate according to the ' + 'actual batch size and the original batch size.') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--diff-rank-seed', + action='store_true', + help='whether or not set different seeds for different ranks') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def merge_args(cfg, args): + """Merge CLI arguments to config.""" + if args.no_validate: + cfg.val_cfg = None + cfg.val_dataloader = None + cfg.val_evaluator = None + + cfg.launcher = args.launcher + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.get('type', 'OptimWrapper') + assert optim_wrapper in ['OptimWrapper', 'AmpOptimWrapper'], \ + '`--amp` is not supported custom optimizer wrapper type ' \ + f'`{optim_wrapper}.' 
+ cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.setdefault('loss_scale', 'dynamic') + + # resume training + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume is not None: + cfg.resume = True + cfg.load_from = args.resume + + # enable auto scale learning rate + if args.auto_scale_lr: + cfg.auto_scale_lr.enable = True + + # set random seeds + if cfg.get('randomness', None) is None: + cfg.randomness = dict( + seed=args.seed, + diff_rank_seed=args.diff_rank_seed, + deterministic=args.deterministic) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + return cfg + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + # merge cli arguments to config + cfg = merge_args(cfg, args) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba694c583580e991ab8468a84df5a226f73384b --- /dev/null +++ b/tools/visualizations/browse_dataset.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import argparse
import sys
import warnings
from copy import deepcopy

import cv2
import mmcv
import numpy as np
from mmengine.config import Config, DictAction
from mmengine.dataset import Compose
from mmengine.registry import init_default_scope
from mmengine.utils import ProgressBar
from mmengine.visualization import Visualizer

from mmaction.registry import DATASETS
from mmaction.visualization import ActionVisualizer
from mmaction.visualization.action_visualizer import _get_adaptive_scale


def parse_args():
    """Parse command line arguments for dataset browsing."""
    parser = argparse.ArgumentParser(description='Browse a dataset')
    parser.add_argument('config', help='train config file path')
    parser.add_argument(
        'output_dir', default=None, type=str, help='output directory')
    parser.add_argument('--label', default=None, type=str, help='label file')
    parser.add_argument(
        '--phase',
        '-p',
        default='train',
        type=str,
        choices=['train', 'test', 'val'],
        help='phase of dataset to visualize, accept "train" "test" and "val".'
        ' Defaults to "train".')
    parser.add_argument(
        '--show-number',
        '-n',
        type=int,
        default=sys.maxsize,
        help='number of images selected to visualize, must bigger than 0. if '
        'the number is bigger than length of dataset, show all the images in '
        'dataset; default "sys.maxsize", show all images in dataset')
    parser.add_argument(
        '--fps',
        default=5,
        type=int,
        help='specify fps value of the output video when using rawframes to '
        'generate file')
    parser.add_argument(
        '--mode',
        '-m',
        default='transformed',
        type=str,
        choices=['original', 'transformed', 'concat', 'pipeline'],
        help='display mode; display original pictures or transformed pictures'
        ' or comparison pictures. "original" means show images load from disk'
        '; "transformed" means to show images after transformed; "concat" '
        'means show images stitched by "original" and "output" images. '
        '"pipeline" means show all the intermediate images. '
        'Defaults to "transformed".')
    parser.add_argument(
        '--rescale-factor',
        '-r',
        type=float,
        help='video rescale factor, which is useful if the output is too '
        'large or too small.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    args = parser.parse_args()
    return args


def make_grid(videos, names, rescale_factor=None):
    """Concat list of pictures into a single big picture, align height here.

    Args:
        videos (list[list[np.ndarray]]): One list of frames per video.
        names (list[str]): Caption for each video column.
        rescale_factor (float, optional): Factor passed to
            ``mmcv.imrescale`` before stitching. Defaults to None.

    Returns:
        list[np.ndarray]: One stitched frame per timestep.
    """
    vis = Visualizer()

    # Remember the pre-rescale shapes so they can be printed in the captions.
    ori_shapes = [vid[0].shape[:2] for vid in videos]
    if rescale_factor is not None:
        videos = [[mmcv.imrescale(img, rescale_factor) for img in video]
                  for video in videos]

    # Extra 40% headroom leaves space under each column for its caption.
    max_height = int(max(vid[0].shape[0] for vid in videos) * 1.4)
    min_width = min(vid[0].shape[1] for vid in videos)
    horizontal_gap = min_width // 10
    img_scale = _get_adaptive_scale((max_height, min_width))

    texts = []
    text_positions = []
    start_x = 0
    for i, vid in enumerate(videos):
        for j, img in enumerate(vid):
            # Center each frame vertically and pad with white so every
            # column has the same height before concatenation.
            pad_height = (max_height - img.shape[0]) // 2
            pad_width = horizontal_gap // 2
            # make border
            videos[i][j] = cv2.copyMakeBorder(
                img,
                pad_height,
                max_height - img.shape[0] - pad_height +
                int(img_scale * 30 * 2),
                pad_width,
                pad_width,
                cv2.BORDER_CONSTANT,
                value=(255, 255, 255))

        texts.append(f'{names[i]}\n{ori_shapes[i]}')
        text_positions.append(
            [start_x + img.shape[1] // 2 + pad_width, max_height])
        start_x += img.shape[1] + horizontal_gap

    out_frames = []
    for i in range(len(videos[0])):
        # Stitch the i-th frame of every video side by side and draw the
        # captions under each column.
        imgs = [vid[i] for vid in videos]
        display_img = np.concatenate(imgs, axis=1)
        vis.set_image(display_img)
        img_scale = _get_adaptive_scale(display_img.shape[:2])
        vis.draw_texts(
            texts,
            positions=np.array(text_positions),
            font_sizes=img_scale * 7,
            colors='black',
            horizontal_alignments='center',
            font_families='monospace')
        out_frames.append(vis.get_image())
    return out_frames


class InspectCompose(Compose):
    """Compose multiple transforms sequentially.

    And record "imgs" field of all results in one list.
    """

    def __init__(self, transforms, intermediate_imgs):
        super().__init__(transforms=transforms)
        # Shared list: the caller keeps a reference and reads the
        # snapshots appended by __call__.
        self.intermediate_imgs = intermediate_imgs

    def __call__(self, data):
        for idx, t in enumerate(self.transforms):
            data = t(data)
            if data is None:
                return None
            if 'imgs' in data:
                name = t.__class__.__name__
                imgs = deepcopy(data['imgs'])
                # FormatShape rearranges dims for the model; its output is
                # not a displayable image, so skip recording it.
                if name == 'FormatShape':
                    continue
                if name == 'ThreeCrop':
                    # Stack the three crops vertically so they can be
                    # shown as a single frame sequence.
                    n_crops = 3
                    clip_len = len(imgs) // n_crops
                    crop_imgs = [
                        imgs[k * clip_len:(k + 1) * clip_len]
                        for k in range(n_crops)
                    ]
                    imgs = np.concatenate(crop_imgs, axis=1)
                    imgs = [img for img in imgs]
                if name == 'TenCrop':
                    warnings.warn(
                        'TenCrop is not supported, only show one crop')
                self.intermediate_imgs.append({'name': name, 'imgs': imgs})
        return data


def main():
    """Browse a dataset and dump visualizations of its samples."""
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    init_default_scope(cfg.get('default_scope', 'mmaction'))

    dataset_cfg = cfg.get(args.phase + '_dataloader').get('dataset')
    dataset = DATASETS.build(dataset_cfg)

    # Wrap the pipeline so every intermediate 'imgs' result is recorded.
    intermediate_imgs = []
    dataset.pipeline = InspectCompose(dataset.pipeline.transforms,
                                      intermediate_imgs)

    # init visualizer
    vis_backends = [dict(
        type='LocalVisBackend',
        save_dir=args.output_dir,
    )]
    visualizer = ActionVisualizer(
        vis_backends=vis_backends, save_dir='place_holder')

    if args.label:
        # FIX: use a context manager so the label file handle is closed
        # deterministically (the original leaked it).
        with open(args.label) as f:
            labels = [x.strip() for x in f.readlines()]
        visualizer.dataset_meta = dict(classes=labels)

    # init visualization video number
    display_number = min(args.show_number, len(dataset))
    progress_bar = ProgressBar(display_number)

    for i, item in zip(range(display_number), dataset):
        rescale_factor = args.rescale_factor
        if args.mode == 'original':
            video = intermediate_imgs[0]['imgs']
        elif args.mode == 'transformed':
            video = intermediate_imgs[-1]['imgs']
        elif args.mode == 'concat':
            ori_video = intermediate_imgs[0]['imgs']
            trans_video = intermediate_imgs[-1]['imgs']
            video = make_grid([ori_video, trans_video],
                              ['original', 'transformed'], rescale_factor)
            # make_grid already applied the rescale; don't rescale twice.
            rescale_factor = None
        else:
            video = make_grid([result['imgs'] for result in intermediate_imgs],
                              [result['name'] for result in intermediate_imgs],
                              rescale_factor)
            rescale_factor = None

        intermediate_imgs.clear()

        data_sample = item['data_samples'].numpy()

        file_id = f'video_{i}'
        # BGR -> RGB for the visualizer backend.
        video = [x[..., ::-1] for x in video]
        visualizer.add_datasample(
            file_id, video, data_sample, fps=args.fps, out_type='video')
        progress_bar.update()


if __name__ == '__main__':
    main()
# ==== tools/visualizations/vis_cam.py ====
# Copyright (c) OpenMMLab. All rights reserved.
+import argparse +import os +import os.path as osp +from typing import Dict, List, Optional, Tuple + +import mmcv +import numpy as np +import torch.nn as nn +from mmengine import Config, DictAction +from mmengine.dataset import Compose, pseudo_collate + +from mmaction.apis import init_recognizer +from mmaction.utils import GradCAM + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMAction2 GradCAM Visualization') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file/url') + parser.add_argument('video', help='video file/url or rawframes directory') + parser.add_argument( + '--use-frames', + default=False, + action='store_true', + help='whether to use rawframes as input') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--target-layer-name', + type=str, + default='backbone/layer4/1/relu', + help='GradCAM target layer name') + parser.add_argument('--out-filename', default=None, help='output filename') + parser.add_argument('--fps', default=5, type=int) + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--target-resolution', + nargs=2, + default=None, + type=int, + help='Target resolution (w, h) for resizing the frames when using a ' + 'video as input. 
If either dimension is set to -1, the frames are ' + 'resized by keeping the existing aspect ratio') + parser.add_argument( + '--resize-algorithm', + default='bilinear', + help='resize algorithm applied to generate video & gif') + + args = parser.parse_args() + return args + + +def build_inputs(model: nn.Module, + video_path: str, + use_frames: bool = False) -> Dict: + """build inputs for GradCAM. + + Note that, building inputs for GradCAM is exactly the same as building + inputs for Recognizer test stage. Codes from `inference_recognizer`. + + Args: + model (nn.Module): Recognizer model. + video_path (str): video file/url or rawframes directory. + use_frames (bool): whether to use rawframes as input. + Defaults to False. + + Returns: + dict: Both GradCAM inputs and Recognizer test stage inputs, + including two keys, ``inputs`` and ``data_samples``. + """ + if not (osp.exists(video_path) or video_path.startswith('http')): + raise RuntimeError(f"'{video_path}' is missing") + + if osp.isfile(video_path) and use_frames: + raise RuntimeError( + f"'{video_path}' is a video file, not a rawframe directory") + if osp.isdir(video_path) and not use_frames: + raise RuntimeError( + f"'{video_path}' is a rawframe directory, not a video file") + + cfg = model.cfg + + # build the data pipeline + test_pipeline = cfg.test_pipeline + test_pipeline = Compose(test_pipeline) + # prepare data + if use_frames: + filename_tmpl = cfg.test_dataloader.dataset.get( + 'filename_tmpl', 'img_{:05}.jpg') + start_index = cfg.test_dataloader.dataset.get('start_index', 1) + data = dict( + frame_dir=video_path, + total_frames=len(os.listdir(video_path)), + label=-1, + start_index=start_index, + filename_tmpl=filename_tmpl, + modality='RGB') + else: + start_index = cfg.test_dataloader.dataset.get('start_index', 0) + data = dict( + filename=video_path, + label=-1, + start_index=start_index, + modality='RGB') + data = test_pipeline(data) + data = pseudo_collate([data]) + + return data + + +def 
_resize_frames(frame_list: List[np.ndarray], + scale: Optional[Tuple[int]] = None, + keep_ratio: bool = True, + interpolation: str = 'bilinear') -> List[np.ndarray]: + """Resize frames according to given scale. + + Codes are modified from `mmaction/datasets/transforms/processing.py`, + `Resize` class. + + Args: + frame_list (list[np.ndarray]): Frames to be resized. + scale (tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: the image will be rescaled as large + as possible within the scale. Otherwise, it serves as (w, h) + of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Defaults to True. + interpolation (str): Algorithm used for interpolation: + 'nearest' | 'bilinear'. Defaults to ``'bilinear'``. + + Returns: + list[np.ndarray]: Resized frames. + """ + if scale is None or (scale[0] == -1 and scale[1] == -1): + return frame_list + scale = tuple(scale) + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + scale = (np.inf, max_long_edge) + + img_h, img_w, _ = frame_list[0].shape + + if keep_ratio: + new_w, new_h = mmcv.rescale_size((img_w, img_h), scale) + else: + new_w, new_h = scale + + frame_list = [ + mmcv.imresize(img, (new_w, new_h), interpolation=interpolation) + for img in frame_list + ] + + return frame_list + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + + # Build the recognizer from a config file and checkpoint file/url + model = init_recognizer(cfg, args.checkpoint, device=args.device) + + inputs = build_inputs(model, args.video, use_frames=args.use_frames) + gradcam = GradCAM(model, args.target_layer_name) + results = gradcam(inputs) + + if args.out_filename is not None: + try: + from moviepy.editor import ImageSequenceClip + except ImportError: + raise ImportError('Please install moviepy to enable output 
file.') + + # frames_batches shape [B, T, H, W, 3], in RGB order + frames_batches = (results[0] * 255.).numpy().astype(np.uint8) + frames = frames_batches.reshape(-1, *frames_batches.shape[-3:]) + + frame_list = list(frames) + frame_list = _resize_frames( + frame_list, + args.target_resolution, + interpolation=args.resize_algorithm) + + video_clips = ImageSequenceClip(frame_list, fps=args.fps) + out_type = osp.splitext(args.out_filename)[1][1:] + if out_type == 'gif': + video_clips.write_gif(args.out_filename) + else: + video_clips.write_videofile(args.out_filename, remove_temp=True) + + +if __name__ == '__main__': + main() diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..990f5c70704aaf8643ce8593689543140a0a89f5 --- /dev/null +++ b/tools/visualizations/vis_scheduler.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os.path as osp +import re +from pathlib import Path +from unittest.mock import MagicMock + +import matplotlib.pyplot as plt +import rich +import torch.nn as nn +from mmengine.config import Config, DictAction +from mmengine.hooks import Hook +from mmengine.model import BaseModel +from mmengine.registry import init_default_scope +from mmengine.runner import Runner +from mmengine.visualization import Visualizer +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn +from torch.utils.data import DataLoader + +from mmaction.utils import get_str_type + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Visualize a Dataset Pipeline') + parser.add_argument('config', help='config file path') + parser.add_argument( + '-p', + '--parameter', + type=str, + default='lr', + choices=['lr', 'momentum'], + help='The parameter to visualize its change curve, choose from' + '"lr" and "momentum". 
Defaults to "lr".') + parser.add_argument( + '-d', + '--dataset-size', + type=int, + help='The size of the dataset. If specify, `build_dataset` will ' + 'be skipped and use this size as the dataset size.') + parser.add_argument( + '-n', + '--ngpus', + type=int, + default=1, + help='The number of GPUs used in training.') + parser.add_argument( + '-s', + '--save-path', + type=Path, + help='The learning rate curve plot save path') + parser.add_argument( + '--log-level', + default='WARNING', + help='The log level of the handler and logger. Defaults to ' + 'WARNING.') + parser.add_argument('--title', type=str, help='title of figure') + parser.add_argument( + '--style', type=str, default='whitegrid', help='style of plt') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--window-size', + default='12*7', + help='Size of the window to display images, in format of "$W*$H".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + if args.window_size != '': + assert re.match(r'\d+\*\d+', args.window_size), \ + "'window-size' must be in format 'W*H'." 
+ + return args + + +class SimpleModel(BaseModel): + """simple model that do nothing in train_step.""" + + def __init__(self): + super(SimpleModel, self).__init__() + self.data_preprocessor = nn.Identity() + self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + pass + + def train_step(self, data, optim_wrapper): + pass + + +class ParamRecordHook(Hook): + + def __init__(self, by_epoch): + super().__init__() + self.by_epoch = by_epoch + self.lr_list = [] + self.momentum_list = [] + self.task_id = 0 + self.progress = Progress(BarColumn(), MofNCompleteColumn(), + TextColumn('{task.description}')) + + def before_train(self, runner): + if self.by_epoch: + total = runner.train_loop.max_epochs + self.task_id = self.progress.add_task( + 'epochs', start=True, total=total) + else: + total = runner.train_loop.max_iters + self.task_id = self.progress.add_task( + 'iters', start=True, total=total) + self.progress.start() + + def after_train_epoch(self, runner): + if self.by_epoch: + self.progress.update(self.task_id, advance=1) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + if not self.by_epoch: + self.progress.update(self.task_id, advance=1) + self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) + self.momentum_list.append( + runner.optim_wrapper.get_momentum()['momentum'][0]) + + def after_train(self, runner): + self.progress.stop() + + +def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True): + """Plot learning rate vs iter graph.""" + try: + import seaborn as sns + sns.set_style(args.style) + except ImportError: + pass + + wind_w, wind_h = args.window_size.split('*') + wind_w, wind_h = int(wind_w), int(wind_h) + plt.figure(figsize=(wind_w, wind_h)) + + ax: plt.Axes = plt.subplot() + ax.plot(lr_list, linewidth=1) + + if by_epoch: + ax.xaxis.tick_top() + ax.set_xlabel('Iters') + ax.xaxis.set_label_position('top') + sec_ax = ax.secondary_xaxis( + 'bottom', + functions=(lambda x: x / 
iters_per_epoch, + lambda y: y * iters_per_epoch)) + sec_ax.set_xlabel('Epochs') + else: + plt.xlabel('Iters') + plt.ylabel(param_name) + + if args.title is None: + plt.title(f'{osp.basename(args.config)} {param_name} curve') + else: + plt.title(args.title) + + +def simulate_train(data_loader, cfg, by_epoch): + model = SimpleModel() + param_record_hook = ParamRecordHook(by_epoch=by_epoch) + default_hooks = dict( + param_scheduler=cfg.default_hooks['param_scheduler'], + runtime_info=None, + timer=None, + logger=None, + checkpoint=None, + sampler_seed=None, + param_record=param_record_hook) + + runner = Runner( + model=model, + work_dir=cfg.work_dir, + train_dataloader=data_loader, + train_cfg=cfg.train_cfg, + log_level=cfg.log_level, + optim_wrapper=cfg.optim_wrapper, + param_scheduler=cfg.param_scheduler, + default_scope=cfg.default_scope, + default_hooks=default_hooks, + auto_scale_lr=cfg.get('auto_scale_lr'), + visualizer=MagicMock(spec=Visualizer), + custom_hooks=cfg.get('custom_hooks', None)) + + runner.train() + + return param_record_hook.lr_list, param_record_hook.momentum_list + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + if cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.log_level = args.log_level + init_default_scope(cfg.get('default_scope', 'mmaction')) + + # make sure save_root exists + if args.save_path and not args.save_path.parent.exists(): + raise FileNotFoundError( + f'The save path is {args.save_path}, and directory ' + f"'{args.save_path.parent}' do not exist.") + + # init logger + print('Param_scheduler :') + rich.print_json(json.dumps(cfg.param_scheduler)) + + # prepare data loader + batch_size = cfg.train_dataloader.batch_size * args.ngpus + + if 'by_epoch' in cfg.train_cfg: + by_epoch = 
cfg.train_cfg.get('by_epoch') + elif 'type' in cfg.train_cfg: + by_epoch = get_str_type(cfg.train_cfg.get('by_epoch')) \ + == 'EpochBasedTrainLoop' + else: + raise ValueError('please set `train_cfg`.') + + if args.dataset_size is None and by_epoch: + from mmaction.registry import DATASETS + dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) + print(f'dataset is {dataset_size}') + else: + dataset_size = args.dataset_size or batch_size + + data_loader = DataLoader(range(dataset_size), batch_size) + assert len(data_loader) > 0, \ + 'Please decrease batchsize to make sure that ' \ + 'a epoch at least have one iteration!' + dataset_info = ( + f'\nDataset infos:' + f'\n - Dataset size: {dataset_size}' + f'\n - Batch size per GPU: {cfg.train_dataloader.batch_size}' + f'\n - Number of GPUs: {args.ngpus}' + f'\n - Total batch size: {batch_size}') + if by_epoch: + dataset_info += f'\n - Iterations per epoch: {len(data_loader)}' + rich.print(dataset_info + '\n') + + # simulation training process + lr_list, momentum_list = simulate_train(data_loader, cfg, by_epoch) + if args.parameter == 'lr': + param_list = lr_list + else: + param_list = momentum_list + + param_name = 'Learning Rate' if args.parameter == 'lr' else 'Momentum' + plot_curve(param_list, args, param_name, len(data_loader), by_epoch) + + if args.save_path: + plt.savefig(args.save_path) + print(f'\nThe {param_name} graph is saved at {args.save_path}') + + if not args.not_show: + plt.show() + + +if __name__ == '__main__': + main()