diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..33c079f4ecf58dc57fd2d7ceb102267646d34957 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ + +.DS_Store +.git/ +.idea/ + +**/file_dir 
+**/flagged/ +**/log/ +**/logs/ +**/__pycache__/ + +/data/ +/docs/ +/dotenv/ +/examples/**/*.wav +/hub_datasets/ +/trained_models*/ +/pretrained_models/ +/temp/ + +**/*.csv +**/*.onnx +**/*.pdf +**/*.md +#**/*.wav +**/*.xlsx +**/*.jsonl +**/*.zip diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..20a85a401d90f6602a79eca3bb2b2e49c5fe6891 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.12 + +WORKDIR /code + +COPY . /code + +RUN pip install --upgrade pip +RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt + +RUN useradd -m -u 1000 user + +USER user + +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +WORKDIR $HOME/app + +COPY --chown=user . $HOME/app + +CMD ["python3", "main.py"] diff --git a/examples/download/download_space.py b/examples/download/download_space.py new file mode 100644 index 0000000000000000000000000000000000000000..b32ece7b167074304f584dcf543fe719b7d738cb --- /dev/null +++ b/examples/download/download_space.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import argparse +import os +import platform + +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" + +from huggingface_hub import snapshot_download + +from project_settings import project_path + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--repo_id", default="intelli-zen/music_comment", type=str) + parser.add_argument( + "--local_dir", + # default=(project_path / "temp/models" / "sft_llama2_stack_exchange").as_posix(), + # default=(project_path / "temp/spaces" / "keep_alive_a").as_posix(), + default=(project_path / "temp/datasets" / "music_comment").as_posix(), + type=str + ) + args = parser.parse_args() + return args + + +def main(): + args = get_args() + + # export HF_ENDPOINT=https://hf-mirror.com + + # 下载整个仓库 + snapshot_download( + # repo_type="model", + # repo_type="space", + repo_type="dataset", + repo_id=args.repo_id, + local_dir=args.local_dir, + 
# ignore_patterns=["*.msgpack", "*.h5", "*.ot"], + ) + + # 或使用命令行 + # pip install huggingface-hub + # huggingface-cli download 模型ID --local-dir ./model + return + + +if __name__ == "__main__": + main() diff --git a/examples/hub_download.py b/examples/hub_download.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9c4fae839d03ecf08a562774c04db74ee5c61c --- /dev/null +++ b/examples/hub_download.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import argparse +import platform + +from huggingface_hub import snapshot_download + +from project_settings import project_path + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--repo_id", default="jingyaogong/MiniMind2", type=str) + parser.add_argument( + "--local_dir", + default=(project_path / "pretrained_models" / "MiniMind2").as_posix(), + type=str + ) + args = parser.parse_args() + return args + + +def main(): + args = get_args() + + # 下载整个仓库 + snapshot_download( + repo_id=args.repo_id, + local_dir=args.local_dir, + ignore_patterns=["*.msgpack", "*.h5", "*.ot"], + ) + + # 或使用命令行 + # pip install huggingface-hub + # huggingface-cli download 模型ID --local-dir ./model + return + + +if __name__ == "__main__": + main() diff --git a/examples/playground/chat.py b/examples/playground/chat.py new file mode 100644 index 0000000000000000000000000000000000000000..0ace9336c264ca016e90a0ff2751aa5a2f8a6704 --- /dev/null +++ b/examples/playground/chat.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +https://github.com/jingyaogong/minimind/blob/master/eval_llm.py +""" +import argparse +import time + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer + +from project_settings import project_path + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + # default="jingyaogong/MiniMind2", + default=(project_path / "pretrained_models/MiniMind2"), + 
type=str + ) + + parser.add_argument( + "--max_new_tokens", + default=8192, # 8192, 128 + type=int, help="最大生成长度(注意:并非模型实际长文本能力)" + ) + parser.add_argument("--top_p", default=0.85, type=float, help="nucleus采样阈值(0-1)") + parser.add_argument("--temperature", default=0.85, type=float, help="生成温度,控制随机性(0-1,越大越随机)") + + parser.add_argument( + "--show_speed", + default=1, # 1, 0 + type=int, help="显示decode速度(tokens/s)" + ) + + args = parser.parse_args() + return args + + +def main(): + args = get_args() + + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + # device = "mps" + device = "cpu" + else: + device = "cpu" + print(f"device: {device}") + + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path) + model = AutoModelForCausalLM.from_pretrained(args.pretrained_model_name_or_path) + model = model.eval().to(device) + # print(tokenizer) + # print(model) + + prompts = [ + "你有什么特长?", + "为什么天空是蓝色的", + "请用Python写一个计算斐波那契数列的函数", + '解释一下"光合作用"的基本过程', + "如果明天下雨,我应该如何出门", + "比较一下猫和狗作为宠物的优缺点", + "解释什么是机器学习", + "推荐一些中国的美食" + ] + input_mode = int(input("[0] 自动测试\n[1] 手动输入\n")) + + streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + + # conversation = list() + conversation = [ + {"role": "system", "content": "You are a helpful assistant"} + ] + while True: + if input_mode == 0: + if len(prompts) == 0: + break + user_input = prompts.pop(0) + print(f"💬: {user_input}") + else: + user_input = input("💬: ") + user_input = str(user_input).strip() + conversation.append({"role": "user", "content": user_input}) + inputs = tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=True + ) + inputs = tokenizer.__call__( + inputs, + return_tensors="pt", + truncation=True + ) + inputs = inputs.to(device) + # print(inputs) + + print("🤖: ", end="") + st = time.time() + generated_ids = model.generate( + inputs=inputs["input_ids"], 
attention_mask=inputs["attention_mask"], + max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer, + pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, + top_p=args.top_p, temperature=args.temperature, repetition_penalty=1.0, + ) + response = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True) + conversation.append({"role": "assistant", "content": response}) + gen_tokens = len(generated_ids[0]) - len(inputs["input_ids"][0]) + print(f"\n[Speed]: {gen_tokens / (time.time() - st):.2f} tokens/s\n\n") if args.show_speed else print("\n\n") + + return + + +if __name__ == "__main__": + main() diff --git a/install.sh b/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e2af719112af27bb4b70fac93ebd7250f14eb53 --- /dev/null +++ b/install.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +# bash install.sh --stage 2 --stop_stage 2 --system_version centos + + +python_version=3.8.10 +system_version="centos"; + +verbose=true; +stage=-1 +stop_stage=0 + + +# parse options +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g); + eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + old_value="(eval echo \\$$name)"; + if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval "${name}=\"$2\""; + + # Check that Boolean-valued arguments are really Boolean. 
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + + *) break; + esac +done + +work_dir="$(pwd)" + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + $verbose && echo "stage 1: install python" + cd "${work_dir}" || exit 1; + + sh ./script/install_python.sh --python_version "${python_version}" --system_version "${system_version}" +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + $verbose && echo "stage 2: create virtualenv" + + # /usr/local/python-3.9.9/bin/virtualenv cc_audio_8 + # source /data/local/bin/cc_audio_8/bin/activate + /usr/local/python-${python_version}/bin/pip3 install virtualenv + mkdir -p /data/local/bin + cd /data/local/bin || exit 1; + /usr/local/python-${python_version}/bin/virtualenv cc_audio_8 + +fi diff --git a/log.py b/log.py new file mode 100644 index 0000000000000000000000000000000000000000..f6db26016c4280627d01ab340ab4e75878a6491e --- /dev/null +++ b/log.py @@ -0,0 +1,257 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +from datetime import datetime +import logging +from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler +import os +from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装 + + +def get_converter(tz_info: str = "Asia/Shanghai"): + def converter(timestamp): + dt = datetime.fromtimestamp(timestamp, ZoneInfo(tz_info)) + result = dt.timetuple() + return result + return converter + + +def setup_stream(tz_info: str = "Asia/Shanghai"): + fmt = "%(asctime)s|%(name)s|%(levelname)s|%(filename)s|%(lineno)d|%(message)s" + + formatter = logging.Formatter( + fmt=fmt, + datefmt="%Y-%m-%d %H:%M:%S %z" + ) + formatter.converter = get_converter(tz_info) + + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(formatter) + + # main + main_logger = logging.getLogger("main") + main_logger.addHandler(stream_handler) + + # http + http_logger = 
logging.getLogger("http") + http_logger.addHandler(stream_handler) + + # api + api_logger = logging.getLogger("api") + api_logger.addHandler(stream_handler) + + logging.basicConfig( + level=logging.DEBUG, + datefmt="%a, %d %b %Y %H:%M:%S", + handlers=[ + + ] + ) + return + + +def setup_size_rotating(log_directory: str, tz_info: str = "Asia/Shanghai"): + fmt = "%(asctime)s|%(name)s|%(levelname)s|%(filename)s|%(lineno)d|%(message)s" + + formatter = logging.Formatter( + fmt=fmt, + datefmt="%Y-%m-%d %H:%M:%S %z" + ) + formatter.converter = get_converter(tz_info) + + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(formatter) + + # main + main_logger = logging.getLogger("main") + main_logger.addHandler(stream_handler) + main_info_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "main.log"), + maxBytes=100*1024*1024, # 100MB + encoding="utf-8", + backupCount=2, + ) + main_info_file_handler.setLevel(logging.INFO) + main_info_file_handler.setFormatter(formatter) + main_logger.addHandler(main_info_file_handler) + + # http + http_logger = logging.getLogger("http") + http_logger.addHandler(stream_handler) + http_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "http.log"), + maxBytes=100*1024*1024, # 100MB + encoding="utf-8", + backupCount=2, + ) + http_file_handler.setLevel(logging.DEBUG) + http_file_handler.setFormatter(formatter) + http_logger.addHandler(http_file_handler) + + # api + api_logger = logging.getLogger("api") + api_logger.addHandler(stream_handler) + api_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "api.log"), + maxBytes=10*1024*1024, # 10MB + encoding="utf-8", + backupCount=2, + ) + api_file_handler.setLevel(logging.DEBUG) + api_file_handler.setFormatter(formatter) + api_logger.addHandler(api_file_handler) + + # alarm + alarm_logger = logging.getLogger("alarm") + alarm_file_handler = RotatingFileHandler( + 
filename=os.path.join(log_directory, "alarm.log"), + maxBytes=1*1024*1024, # 1MB + encoding="utf-8", + backupCount=2, + ) + alarm_file_handler.setLevel(logging.DEBUG) + alarm_file_handler.setFormatter(formatter) + alarm_logger.addHandler(alarm_file_handler) + + debug_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "debug.log"), + maxBytes=1*1024*1024, # 1MB + encoding="utf-8", + backupCount=2, + ) + debug_file_handler.setLevel(logging.DEBUG) + debug_file_handler.setFormatter(formatter) + + info_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "info.log"), + maxBytes=1*1024*1024, # 1MB + encoding="utf-8", + backupCount=2, + ) + info_file_handler.setLevel(logging.INFO) + info_file_handler.setFormatter(formatter) + + error_file_handler = RotatingFileHandler( + filename=os.path.join(log_directory, "error.log"), + maxBytes=1*1024*1024, # 1MB + encoding="utf-8", + backupCount=2, + ) + error_file_handler.setLevel(logging.ERROR) + error_file_handler.setFormatter(formatter) + + logging.basicConfig( + level=logging.DEBUG, + datefmt="%a, %d %b %Y %H:%M:%S", + handlers=[ + debug_file_handler, + info_file_handler, + error_file_handler, + ] + ) + + +def setup_time_rotating(log_directory: str): + fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s" + + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(logging.Formatter(fmt)) + + # main + main_logger = logging.getLogger("main") + main_logger.addHandler(stream_handler) + main_info_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "main.log"), + encoding="utf-8", + when="midnight", + interval=1, + backupCount=7 + ) + main_info_file_handler.setLevel(logging.INFO) + main_info_file_handler.setFormatter(logging.Formatter(fmt)) + main_logger.addHandler(main_info_file_handler) + + # http + http_logger = logging.getLogger("http") + http_file_handler = 
TimedRotatingFileHandler( + filename=os.path.join(log_directory, "http.log"), + encoding='utf-8', + when="midnight", + interval=1, + backupCount=7 + ) + http_file_handler.setLevel(logging.DEBUG) + http_file_handler.setFormatter(logging.Formatter(fmt)) + http_logger.addHandler(http_file_handler) + + # api + api_logger = logging.getLogger("api") + api_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "api.log"), + encoding='utf-8', + when="midnight", + interval=1, + backupCount=7 + ) + api_file_handler.setLevel(logging.DEBUG) + api_file_handler.setFormatter(logging.Formatter(fmt)) + api_logger.addHandler(api_file_handler) + + # alarm + alarm_logger = logging.getLogger("alarm") + alarm_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "alarm.log"), + encoding="utf-8", + when="midnight", + interval=1, + backupCount=7 + ) + alarm_file_handler.setLevel(logging.DEBUG) + alarm_file_handler.setFormatter(logging.Formatter(fmt)) + alarm_logger.addHandler(alarm_file_handler) + + debug_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "debug.log"), + encoding="utf-8", + when="D", + interval=1, + backupCount=7 + ) + debug_file_handler.setLevel(logging.DEBUG) + debug_file_handler.setFormatter(logging.Formatter(fmt)) + + info_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "info.log"), + encoding="utf-8", + when="D", + interval=1, + backupCount=7 + ) + info_file_handler.setLevel(logging.INFO) + info_file_handler.setFormatter(logging.Formatter(fmt)) + + error_file_handler = TimedRotatingFileHandler( + filename=os.path.join(log_directory, "error.log"), + encoding="utf-8", + when="D", + interval=1, + backupCount=7 + ) + error_file_handler.setLevel(logging.ERROR) + error_file_handler.setFormatter(logging.Formatter(fmt)) + + logging.basicConfig( + level=logging.DEBUG, + datefmt="%a, %d %b %Y %H:%M:%S", + handlers=[ + debug_file_handler, + info_file_handler, + 
error_file_handler, + ] + ) + + +if __name__ == "__main__": + pass diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..b56c69556980a9f1742f2c6a109378e206e2cbe9 --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +# 这是一个示例 Python 脚本。 + +# 按 ⌃R 执行或将其替换为您的代码。 +# 按 双击 ⇧ 在所有地方搜索类、文件、工具窗口、操作和设置。 + + +def print_hi(name): + # 在下面的代码行中使用断点来调试脚本。 + print(f'Hi, {name}') # 按 ⌘F8 切换断点。 + + +# 按装订区域中的绿色按钮以运行脚本。 +if __name__ == '__main__': + print_hi('PyCharm') + +# 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助 diff --git a/project_settings.py b/project_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..1448bf339b0fdf3f0e675498f2df5fbd6a87ac32 --- /dev/null +++ b/project_settings.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import os +from pathlib import Path + +from toolbox.os.environment import EnvironmentManager + + +project_path = os.path.abspath(os.path.dirname(__file__)) +project_path = Path(project_path) + +log_directory = project_path / "logs" +log_directory.mkdir(parents=True, exist_ok=True) + +temp_directory = project_path / "temp" +temp_directory.mkdir(parents=True, exist_ok=True) + +environment = EnvironmentManager( + path=os.path.join(project_path, "dotenv"), + env=os.environ.get("environment", "dev"), +) + + +if __name__ == "__main__": + pass diff --git a/script/install_nvidia_driver.sh b/script/install_nvidia_driver.sh new file mode 100644 index 0000000000000000000000000000000000000000..c3bdb66d371c4687d5d0bc8cff223b3f529a301b --- /dev/null +++ b/script/install_nvidia_driver.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +#GPU驱动安装需要先将原有的显示关闭, 重启机器, 再进行安装. 
+#参考链接: +#https://blog.csdn.net/kingschan/article/details/19033595 +#https://blog.csdn.net/HaixWang/article/details/90408538 +# +#>>> yum install -y pciutils +#查看 linux 机器上是否有 GPU +#lspci |grep -i nvidia +# +#>>> lspci |grep -i nvidia +#00:08.0 3D controller: NVIDIA Corporation TU104GL [Tesla T4] (rev a1) +# +# +#NVIDIA 驱动程序下载 +#先在 pytorch 上查看应该用什么 cuda 版本, 再安装对应的 cuda-toolkit cuda. +#再根据 gpu 版本下载安装对应的 nvidia 驱动 +# +## pytorch 版本 +#https://pytorch.org/get-started/locally/ +# +## CUDA 下载 (好像不需要这个) +#https://developer.nvidia.com/cuda-toolkit-archive +# +## nvidia 驱动 +#https://www.nvidia.cn/Download/index.aspx?lang=cn +#http://www.nvidia.com/Download/index.aspx +# +#在下方的下拉列表中进行选择,针对您的 NVIDIA 产品确定合适的驱动。 +#产品类型: +#Data Center / Tesla +#产品系列: +#T-Series +#产品家族: +#Tesla T4 +#操作系统: +#Linux 64-bit +#CUDA Toolkit: +#10.2 +#语言: +#Chinese (Simpleified) +# +# +#>>> mkdir -p /data/tianxing +#>>> cd /data/tianxing +#>>> wget https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run +#>>> sh NVIDIA-Linux-x86_64-440.118.02.run +# +## 异常: +#ERROR: The Nouveau kernel driver is currently in use by your system. This driver is incompatible with the NVIDIA driver, and must be disabled before proceeding. Please consult the NVIDIA driver README and your +#Linux distribution's documentation for details on how to correctly disable the Nouveau kernel driver. +#[OK] +# +#For some distributions, Nouveau can be disabled by adding a file in the modprobe configuration directory. Would you like nvidia-installer to attempt to create this modprobe file for you? +#[NO] +# +#ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download +#page at www.nvidia.com. 
+#[OK] +# +## 参考链接: +#https://blog.csdn.net/kingschan/article/details/19033595 +# +## 禁用原有的显卡驱动 nouveau +#>>> echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf +#>>> sudo dracut --force +## 重启 +#>>> reboot +# +#>>> init 3 +#>>> sh NVIDIA-Linux-x86_64-440.118.02.run +# +## 异常 +#ERROR: Unable to find the kernel source tree for the currently running kernel. Please make sure you have installed the kernel source files for your kernel and that they are properly configured; on Red Hat Linux systems, for example, be sure you have the 'kernel-source' or 'kernel-devel' RPM installed. If you know the correct kernel source files are installed, you may specify the kernel source path with the '--kernel-source-path' command line option. +#[OK] +#ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download +#page at www.nvidia.com. +#[OK] +# +## 参考链接 +## https://blog.csdn.net/HaixWang/article/details/90408538 +# +#>>> uname -r +#3.10.0-1160.49.1.el7.x86_64 +#>>> yum install kernel-devel kernel-headers -y +#>>> yum info kernel-devel kernel-headers +#>>> yum install -y "kernel-devel-uname-r == $(uname -r)" +#>>> yum -y distro-sync +# +#>>> sh NVIDIA-Linux-x86_64-440.118.02.run +# +## 安装成功 +#WARNING: nvidia-installer was forced to guess the X library path '/usr/lib64' and X module path '/usr/lib64/xorg/modules'; these paths were not queryable from the system. If X fails to find the NVIDIA X driver +#module, please install the `pkg-config` utility and the X.Org SDK/development package for your distribution and reinstall the driver. +#[OK] +#Install NVIDIA's 32-bit compatibility libraries? +#[YES] +#Installation of the kernel module for the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version 440.118.02) is now complete. +#[OK] +# +# +## 查看 GPU 使用情况; watch -n 1 -d nvidia-smi 每1秒刷新一次. 
+#>>> nvidia-smi +#Thu Mar 9 12:00:37 2023 +#+-----------------------------------------------------------------------------+ +#| NVIDIA-SMI 440.118.02 Driver Version: 440.118.02 CUDA Version: 10.2 | +#|-------------------------------+----------------------+----------------------+ +#| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +#| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +#|===============================+======================+======================| +#| 0 Tesla T4 Off | 00000000:00:08.0 Off | Off | +#| N/A 54C P0 22W / 70W | 0MiB / 16127MiB | 0% Default | +#+-------------------------------+----------------------+----------------------+ +# +#+-----------------------------------------------------------------------------+ +#| Processes: GPU Memory | +#| GPU PID Type Process name Usage | +#|=============================================================================| +#| No running processes found | +#+-----------------------------------------------------------------------------+ +# +# + +# params +stage=1 +nvidia_driver_filename=https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run + +# parse options +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g); + eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + old_value="(eval echo \\$$name)"; + if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval "${name}=\"$2\""; + + # Check that Boolean-valued arguments are really Boolean. 
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + + *) break; + esac +done + +echo "stage: ${stage}"; + +yum -y install wget +yum -y install sudo + +if [ ${stage} -eq 0 ]; then + mkdir -p /data/dep + cd /data/dep || echo 1; + wget -P /data/dep ${nvidia_driver_filename} + + echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf + sudo dracut --force + # 重启 + reboot +elif [ ${stage} -eq 1 ]; then + init 3 + + yum install -y kernel-devel kernel-headers + yum info kernel-devel kernel-headers + yum install -y "kernel-devel-uname-r == $(uname -r)" + yum -y distro-sync + + cd /data/dep || echo 1; + + # 安装时, 需要回车三下. + sh NVIDIA-Linux-x86_64-440.118.02.run + nvidia-smi +fi diff --git a/script/install_python.sh b/script/install_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..67027ffd217a5ec322cf887797212ed04fd8dd46 --- /dev/null +++ b/script/install_python.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash + +# 参数: +python_version="3.6.5"; +system_version="centos"; + + +# parse options +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g); + eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + old_value="(eval echo \\$$name)"; + if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval "${name}=\"$2\""; + + # Check that Boolean-valued arguments are really Boolean. 
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + + *) break; + esac +done + +echo "python_version: ${python_version}"; +echo "system_version: ${system_version}"; + + +if [ ${system_version} = "centos" ]; then + # 安装 python 开发编译环境 + yum -y groupinstall "Development tools" + yum -y install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel + yum install libffi-devel -y + yum install -y wget + yum install -y make + + mkdir -p /data/dep + cd /data/dep || exit 1; + if [ ! -e Python-${python_version}.tgz ]; then + wget -P /data/dep https://www.python.org/ftp/python/${python_version}/Python-${python_version}.tgz + fi + + cd /data/dep || exit 1; + if [ ! -d Python-${python_version} ]; then + tar -zxvf Python-${python_version}.tgz + cd /data/dep/Python-${python_version} || exit 1; + fi + + mkdir /usr/local/python-${python_version} + ./configure --prefix=/usr/local/python-${python_version} + make && make install + + /usr/local/python-${python_version}/bin/python3 -V + /usr/local/python-${python_version}/bin/pip3 -V + + rm -rf /usr/local/bin/python3 + rm -rf /usr/local/bin/pip3 + ln -s /usr/local/python-${python_version}/bin/python3 /usr/local/bin/python3 + ln -s /usr/local/python-${python_version}/bin/pip3 /usr/local/bin/pip3 + + python3 -V + pip3 -V + +elif [ ${system_version} = "ubuntu" ]; then + # 安装 python 开发编译环境 + # https://zhuanlan.zhihu.com/p/506491209 + + # 刷新软件包目录 + sudo apt update + # 列出当前可用的更新 + sudo apt list --upgradable + # 如上一步提示有可以更新的项目,则执行更新 + sudo apt -y upgrade + # 安装 GCC 编译器 + sudo apt install gcc + # 检查安装是否成功 + gcc -v + + # 安装依赖 + sudo apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev + + mkdir -p /data/dep + cd /data/dep || 
exit 1; + if [ ! -e Python-${python_version}.tgz ]; then + # sudo wget -P /data/dep https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tgz + sudo wget -P /data/dep https://www.python.org/ftp/python/${python_version}/Python-${python_version}.tgz + fi + + cd /data/dep || exit 1; + if [ ! -d Python-${python_version} ]; then + # tar -zxvf Python-3.6.5.tgz + tar -zxvf Python-${python_version}.tgz + # cd /data/dep/Python-3.6.5 + cd /data/dep/Python-${python_version} || exit 1; + fi + + # mkdir /usr/local/python-3.6.5 + mkdir /usr/local/python-${python_version} + + # 检查依赖与配置编译 + # sudo ./configure --prefix=/usr/local/python-3.6.5 --enable-optimizations --with-lto --enable-shared + sudo ./configure --prefix=/usr/local/python-${python_version} --enable-optimizations --with-lto --enable-shared + cpu_count=$(cat /proc/cpuinfo | grep processor | wc -l) + # sudo make -j 4 + sudo make -j "${cpu_count}" + + /usr/local/python-${python_version}/bin/python3 -V + /usr/local/python-${python_version}/bin/pip3 -V + + rm -rf /usr/local/bin/python3 + rm -rf /usr/local/bin/pip3 + ln -s /usr/local/python-${python_version}/bin/python3 /usr/local/bin/python3 + ln -s /usr/local/python-${python_version}/bin/pip3 /usr/local/bin/pip3 + + python3 -V + pip3 -V +fi diff --git a/toolbox/__init__.py b/toolbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/cv2/__init__.py b/toolbox/cv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc5155c67cae42f80e8126d1727b0edc1e02398 --- /dev/null +++ b/toolbox/cv2/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +if __name__ == '__main__': + pass diff --git a/toolbox/cv2/misc.py b/toolbox/cv2/misc.py new file mode 100644 index 
0000000000000000000000000000000000000000..28b2e735b162e15a596d47b33d4398721a37e2d2 --- /dev/null +++ b/toolbox/cv2/misc.py @@ -0,0 +1,137 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +from typing import List, Union + + +def erode(labels: List[Union[str, int]], erode_label: Union[str, int], n: int = 1): + """ + 遍历 labels 列表, 将连续的 erode_label 标签侵蚀 n 个. + """ + result = list() + in_span = False + count = 0 + for idx, label in enumerate(labels): + if label == erode_label: + if not in_span: + in_span = True + count = 0 + if count < n: + if len(result) == 0: + result.append(label) + else: + result.append(result[-1]) + count += 1 + continue + else: + result.append(label) + continue + elif label != erode_label: + if in_span: + in_span = False + + for i in range(min(len(result), n)): + result[-i-1] = label + result.append(label) + continue + else: + result.append(label) + continue + + result.append(label) + return result + + +def dilate(labels: List[Union[str, int]], dilate_label: Union[str, int], n: int = 1): + """ + 遍历 labels 列表, 将连续的 dilate_label 标签扩张 n 个. 
+ """ + result = list() + in_span = False + count = float('inf') + for idx, label in enumerate(labels): + if count < n: + result.append(dilate_label) + count += 1 + continue + if label == dilate_label: + if not in_span: + in_span = True + + for i in range(min(len(result), n)): + result[-i-1] = label + result.append(label) + continue + else: + result.append(label) + continue + else: + if in_span: + in_span = False + result.append(dilate_label) + count = 1 + continue + else: + result.append(label) + continue + + return result + + +def demo1(): + labels = [ + 'voice', 'mute', 'mute', 'voice', 'voice', 'voice', 'voice', 'bell', 'bell', 'bell', 'mute', 'mute', 'mute', 'voice', + ] + + result = erode( + labels=labels, + erode_label='voice', + n=1, + + ) + print(len(labels)) + print(len(result)) + print(result) + return + + +def demo2(): + labels = [ + 'voice', 'mute', 'mute', 'voice', 'voice', 'voice', 'voice', 'bell', 'bell', 'bell', 'mute', 'mute', 'mute', 'voice', + ] + + result = dilate( + labels=labels, + dilate_label='voice', + n=2, + + ) + print(len(labels)) + print(len(result)) + print(result) + + return + + +def demo3(): + import time + labels = ['mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'voice', 'bell', 'bell', 'bell', 'bell', 'bell', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'bell', 'bell', 'bell', 'bell', 'bell', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'bell', 'bell', 'bell', 'bell', 'bell', 'bell', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute', 'mute'] + + begin = time.time() + labels = erode(labels, erode_label='music', n=1) + labels = dilate(labels, dilate_label='music', n=1) + + labels = dilate(labels, dilate_label='voice', n=2) + labels = erode(labels, erode_label='voice', n=2) + labels = 
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import Callable


def traverse(js, callback: Callable, *args, **kwargs):
    """
    Recursively walk a JSON-like structure and rebuild it.

    Lists, tuples and dicts are rebuilt with every element (dict keys
    included) traversed recursively.  ``int`` and ``str`` leaves are passed
    through ``callback(leaf, *args, **kwargs)``; any other leaf (e.g. float,
    None) is returned unchanged.
    """
    if isinstance(js, list):
        return [traverse(item, callback, *args, **kwargs) for item in js]
    if isinstance(js, tuple):
        return tuple(traverse(item, callback, *args, **kwargs) for item in js)
    if isinstance(js, dict):
        return {
            traverse(key, callback, *args, **kwargs): traverse(value, callback, *args, **kwargs)
            for key, value in js.items()
        }
    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)
    return js


def demo1():
    d = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "NxcloudAI2021!",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["10.20.251.8"],
            "http_auth": ["elastic", "ElasticAI2021!"],
            "port": 9200
        }
    }

    def callback(s):
        # Strip the leading '$' marker from placeholder strings.
        if isinstance(s, str) and s.startswith('$'):
            return s[1:]
        return s

    result = traverse(d, callback=callback)
    print(result)
    return


if __name__ == '__main__':
    demo1()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from transformers import PretrainedConfig


class MiniMindConfig(PretrainedConfig):
    """Configuration for the MiniMind causal language model.

    Holds the dense-transformer hyperparameters plus an optional
    Mixture-of-Experts (MoE) section that is only meaningful when
    ``use_moe`` is True.
    """

    model_type = "minimind"

    def __init__(
            self,
            dropout: float = 0.0,
            bos_token_id: int = 1,
            eos_token_id: int = 2,
            hidden_act: str = 'silu',
            hidden_size: int = 512,
            intermediate_size: int = None,  # None: derived later by the MLP (see FeedForward)
            max_position_embeddings: int = 32768,
            num_attention_heads: int = 8,
            num_hidden_layers: int = 8,
            num_key_value_heads: int = 2,
            vocab_size: int = 6400,
            rms_norm_eps: float = 1e-05,
            rope_theta: float = 1000000.0,  # annotation fixed: default is a float
            inference_rope_scaling: bool = False,
            flash_attn: bool = True,
            ####################################################
            # Here are the specific configurations of MOE
            # When use_moe is false, the following is invalid
            ####################################################
            use_moe: bool = False,
            num_experts_per_tok: int = 2,
            n_routed_experts: int = 4,
            n_shared_experts: int = 1,
            scoring_func: str = 'softmax',
            aux_loss_alpha: float = 0.01,
            seq_aux: bool = True,
            norm_topk_prob: bool = True,
            **kwargs
    ):
        super().__init__(**kwargs)
        self.dropout = dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.inference_rope_scaling = inference_rope_scaling
        # Extrapolated context length = factor * original_max_position_embeddings = 32768
        self.rope_scaling = {
            "beta_fast": 32,
            "beta_slow": 1,
            "factor": 16,
            "original_max_position_embeddings": 2048,
            "attention_factor": 1.0,
            "type": "yarn"
        } if self.inference_rope_scaling else None
        self.flash_attn = flash_attn
        ####################################################
        # Here are the specific configurations of MOE
        # When use_moe is false, the following is invalid
        ####################################################
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok  # number of experts selected per token
        self.n_routed_experts = n_routed_experts  # total number of routed experts
        self.n_shared_experts = n_shared_experts  # number of always-on shared experts
        self.scoring_func = scoring_func  # gate scoring function, default 'softmax'
        self.aux_loss_alpha = aux_loss_alpha  # weight of the load-balancing auxiliary loss
        self.seq_aux = seq_aux  # whether to compute the auxiliary loss at sequence level
        self.norm_topk_prob = norm_topk_prob  # whether to renormalize the top-k probabilities


if __name__ == "__main__":
    pass
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math
import torch
import torch.nn.init as init
import torch.nn.functional as F
from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, Tuple, List, Union
from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from toolbox.minimind.model.configuration_minimind import MiniMindConfig


class RMSNorm(torch.nn.Module):
    """Root-mean-square layer norm: x / rms(x) scaled by a learned weight.

    Normalization is done in float32 and cast back to the input dtype.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # rsqrt of the mean of squares over the last dim; eps avoids div-by-zero.
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self.weight * self._norm(x.float()).type_as(x)


def precompute_freqs_cis(dim: int, end: int = int(32 * 1024), rope_base: float = 1e6,
                         rope_scaling: Optional[dict] = None):
    """Precompute RoPE cos/sin tables of shape (end, dim).

    The half-dim frequency vector is duplicated along the last axis so the
    tables line up with rotate_half-style rotary application.  When
    ``rope_scaling`` is given and ``end`` exceeds the original training
    length, the YaRN per-frequency ramp is applied and the tables are scaled
    by ``attention_factor``.
    """
    freqs, attn_factor = 1.0 / (rope_base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)), 1.0
    if rope_scaling is not None:
        orig_max, factor, beta_fast, beta_slow, attn_factor = (
            rope_scaling.get("original_max_position_embeddings", 2048), rope_scaling.get("factor", 16),
            rope_scaling.get("beta_fast", 32.0), rope_scaling.get("beta_slow", 1.0), rope_scaling.get("attention_factor", 1.0)
        )
        # Only rescale when extrapolating beyond the original context length.
        if end / orig_max > 1.0:
            # YaRN: f'(i) = f(i)((1-γ) + γ/s), where γ∈[0,1] is linear ramp
            inv_dim = lambda b: (dim * math.log(orig_max / (b * 2 * math.pi))) / (2 * math.log(rope_base))
            low, high = max(math.floor(inv_dim(beta_fast)), 0), min(math.ceil(inv_dim(beta_slow)), dim // 2 - 1)
            ramp = torch.clamp((torch.arange(dim // 2, device=freqs.device).float() - low) / max(high - low, 0.001), 0, 1)
            freqs = freqs * (1 - ramp + ramp / factor)

    t = torch.arange(end, device=freqs.device)
    # Outer product position x frequency, then duplicate halves to width `dim`.
    freqs = torch.outer(t, freqs).float()
    freqs_cos = torch.cat([torch.cos(freqs), torch.cos(freqs)], dim=-1) * attn_factor
    freqs_sin = torch.cat([torch.sin(freqs), torch.sin(freqs)], dim=-1) * attn_factor
    return freqs_cos, freqs_sin


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply rotary position embedding to query and key tensors.

    ``unsqueeze_dim`` is the axis along which cos/sin are broadcast
    (default 1, i.e. the head axis).  ``position_ids`` is accepted for
    signature compatibility but unused here.
    """
    def rotate_half(x):
        # Swap the two halves of the last dim, negating the second half.
        return torch.cat((-x[..., x.shape[-1] // 2:], x[..., : x.shape[-1] // 2]), dim=-1)

    q_embed = (q * cos.unsqueeze(unsqueeze_dim)) + (rotate_half(q) * sin.unsqueeze(unsqueeze_dim))
    k_embed = (k * cos.unsqueeze(unsqueeze_dim)) + (rotate_half(k) * sin.unsqueeze(unsqueeze_dim))
    return q_embed, k_embed


def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)

    Expands grouped-query KV heads so each query head has a matching
    key/value head; a no-op when n_rep == 1.
    """
    bs, slen, num_key_value_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, :, None, :].expand(bs, slen, num_key_value_heads, n_rep, head_dim).reshape(bs, slen, num_key_value_heads * n_rep, head_dim)
    )
    def forward(self,
                x: torch.Tensor,
                position_embeddings: Tuple[torch.Tensor, torch.Tensor],  # receives (cos, sin) tables
                past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
                use_cache=False,
                attention_mask: Optional[torch.Tensor] = None):
        """Multi-head attention with RoPE, optional KV cache and GQA.

        Returns ``(output, past_kv)`` where ``past_kv`` is the updated
        (key, value) pair when ``use_cache`` is True, else None.
        """
        bsz, seq_len, _ = x.shape
        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
        xq = xq.view(bsz, seq_len, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)

        cos, sin = position_embeddings
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)

        # KV cache: prepend cached keys/values along the sequence dim.
        if past_key_value is not None:
            xk = torch.cat([past_key_value[0], xk], dim=1)
            xv = torch.cat([past_key_value[1], xv], dim=1)
        past_kv = (xk, xv) if use_cache else None

        # Expand KV heads (GQA) and move heads before sequence for matmul.
        xq, xk, xv = (
            xq.transpose(1, 2),
            repeat_kv(xk, self.n_rep).transpose(1, 2),
            repeat_kv(xv, self.n_rep).transpose(1, 2)
        )

        # Fast path: fused SDPA only for full (uncached) sequences with no
        # padding mask, since is_causal assumes query and key lengths match.
        if self.flash and (seq_len > 1) and (past_key_value is None) and (attention_mask is None or torch.all(attention_mask == 1)):
            output = F.scaled_dot_product_attention(xq, xk, xv, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
        else:
            scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
            # Causal mask applied only to the last seq_len key columns:
            # the cached prefix (earlier columns) stays fully visible.
            scores[:, :, :, -seq_len:] += torch.triu(torch.full((seq_len, seq_len), float("-inf"), device=scores.device), diagonal=1)

            if attention_mask is not None:
                # Convert the 0/1 padding mask to additive -1e9 penalties.
                extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
                extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
                scores = scores + extended_attention_mask

            # Softmax in float32 for stability, then cast back.
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = scores @ xv

        output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
        output = self.resid_dropout(self.o_proj(output))
        return output, past_kv
__init__(self, config: MiniMindConfig): + super().__init__() + if config.intermediate_size is None: + intermediate_size = int(config.hidden_size * 8 / 3) + config.intermediate_size = 64 * ((intermediate_size + 64 - 1) // 64) + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.dropout = nn.Dropout(config.dropout) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.dropout(self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))) + + +class MoEGate(nn.Module): + def __init__(self, config: MiniMindConfig): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + + self.scoring_func = config.scoring_func + self.alpha = config.aux_loss_alpha + self.seq_aux = config.seq_aux + + self.norm_topk_prob = config.norm_topk_prob + self.gating_dim = config.hidden_size + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim))) + self.reset_parameters() + + def reset_parameters(self) -> None: + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + + def forward(self, hidden_states): + bsz, seq_len, h = hidden_states.shape + hidden_states = hidden_states.view(-1, h) + logits = F.linear(hidden_states, self.weight, None) + if self.scoring_func == 'softmax': + scores = logits.softmax(dim=-1) + else: + raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}') + + topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False) + + if self.top_k > 1 and self.norm_topk_prob: + denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20 + topk_weight = topk_weight / denominator + + if self.training and self.alpha > 0.0: + scores_for_aux = scores + aux_topk = self.top_k + 
topk_idx_for_aux_loss = topk_idx.view(bsz, -1) + if self.seq_aux: + scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1) + ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device) + ce.scatter_add_(1, topk_idx_for_aux_loss, + torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device)).div_( + seq_len * aux_topk / self.n_routed_experts) + aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha + else: + mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts) + ce = mask_ce.float().mean(0) + Pi = scores_for_aux.mean(0) + fi = ce * self.n_routed_experts + aux_loss = (Pi * fi).sum() * self.alpha + else: + aux_loss = scores.new_zeros(1).squeeze() + return topk_idx, topk_weight, aux_loss + + +class MOEFeedForward(nn.Module): + def __init__(self, config: MiniMindConfig): + super().__init__() + self.config = config + self.experts = nn.ModuleList([ + FeedForward(config) + for _ in range(config.n_routed_experts) + ]) + self.gate = MoEGate(config) + if config.n_shared_experts > 0: + self.shared_experts = nn.ModuleList([ + FeedForward(config) + for _ in range(config.n_shared_experts) + ]) + + def forward(self, x): + identity = x + orig_shape = x.shape + bsz, seq_len, _ = x.shape + # 使用门控机制选择专家 + topk_idx, topk_weight, aux_loss = self.gate(x) + x = x.view(-1, x.shape[-1]) + flat_topk_idx = topk_idx.view(-1) + if self.training: + x = x.repeat_interleave(self.config.num_experts_per_tok, dim=0) + y = torch.empty_like(x, dtype=x.dtype) + for i, expert in enumerate(self.experts): + expert_out = expert(x[flat_topk_idx == i]) + if expert_out.shape[0] > 0: y[flat_topk_idx == i] = expert_out.to(y.dtype) + else: y[flat_topk_idx == i] = expert_out.to(y.dtype) + 0 * sum(p.sum() for p in expert.parameters()) + y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) + y = y.view(*orig_shape) + else: + y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape) + if 
self.config.n_shared_experts > 0: + for expert in self.shared_experts: + y = y + expert(identity) + self.aux_loss = aux_loss + return y + + @torch.no_grad() + def moe_infer(self, x, flat_expert_indices, flat_expert_weights): + expert_cache = torch.zeros_like(x) + idxs = flat_expert_indices.argsort() + tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0) + token_idxs = idxs // self.config.num_experts_per_tok + # 当tokens_per_expert = [6, 15, 20, 26],tokens_per_expert.shape[0]即为专家数量(此时为4) + # 且token_idxs = [3, 7, 19, 21, 24, 25, 4, 5, 6, 10, 11, 12...] 时 + # 意味token_idxs[:6] -> [3, 7, 19, 21, 24, 25]这6个位置属于专家0处理的token(每个token有可能被多个专家处理,这取决于num_experts_per_tok) + # 接下来9个位置token_idxs[6:15] -> [4, 5, 6, 10, 11, 12...]属于专家1处理的token...依此类推 + for i, end_idx in enumerate(tokens_per_expert): + start_idx = 0 if i == 0 else tokens_per_expert[i - 1] + if start_idx == end_idx: + continue + expert = self.experts[i] + exp_token_idx = token_idxs[start_idx:end_idx] + expert_tokens = x[exp_token_idx] + expert_out = expert(expert_tokens).to(expert_cache.dtype) + expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) + expert_cache.scatter_add_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out) + + return expert_cache + + +class MiniMindBlock(nn.Module): + def __init__(self, layer_id: int, config: MiniMindConfig): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_dim = config.hidden_size // config.num_attention_heads + self.self_attn = Attention(config) + + self.layer_id = layer_id + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = FeedForward(config) if not config.use_moe else MOEFeedForward(config) + + def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None): + residual = hidden_states 
+ hidden_states, present_key_value = self.self_attn( + self.input_layernorm(hidden_states), position_embeddings, + past_key_value, use_cache, attention_mask + ) + hidden_states += residual + hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states)) + return hidden_states, present_key_value + + +class MiniMindModel(nn.Module): + def __init__(self, config: MiniMindConfig): + super().__init__() + self.config = config + self.vocab_size, self.num_hidden_layers = config.vocab_size, config.num_hidden_layers + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.dropout = nn.Dropout(config.dropout) + self.layers = nn.ModuleList([MiniMindBlock(l, config) for l in range(self.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + freqs_cos, freqs_sin = precompute_freqs_cis(dim=config.hidden_size // config.num_attention_heads, + end=config.max_position_embeddings, rope_base=config.rope_theta, + rope_scaling=config.rope_scaling) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + + def forward(self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, + use_cache: bool = False, + **kwargs): + batch_size, seq_length = input_ids.shape + if hasattr(past_key_values, 'layers'): past_key_values = None + past_key_values = past_key_values or [None] * len(self.layers) + start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0 + + hidden_states = self.dropout(self.embed_tokens(input_ids)) + + position_embeddings = ( + self.freqs_cos[start_pos:start_pos + seq_length], + self.freqs_sin[start_pos:start_pos + seq_length] + ) + + presents = [] + for layer_idx, (layer, past_key_value) in enumerate(zip(self.layers, past_key_values)): + hidden_states, present = layer( + hidden_states, 
class MiniMindForCausalLM(PreTrainedModel, GenerationMixin):
    """Causal LM head over MiniMindModel with tied input/output embeddings."""

    config_class = MiniMindConfig

    def __init__(self, config: MiniMindConfig = None):
        self.config = config or MiniMindConfig()
        super().__init__(self.config)
        self.model = MiniMindModel(self.config)
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        # Weight tying: token embedding and LM head share one weight matrix.
        self.model.embed_tokens.weight = self.lm_head.weight

    def forward(self,
                input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None,
                past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
                use_cache: bool = False,
                logits_to_keep: Union[int, torch.Tensor] = 0,
                **args):
        """Run the backbone, project to vocab logits and optionally compute loss.

        ``logits_to_keep``: int n keeps only the last n positions' logits
        (0 keeps all, since slice(0, None) is the full range); a tensor is
        used directly as an index.  Labels are shifted by one for next-token
        cross-entropy with ignore_index=-100.  The MoE auxiliary loss is
        attached to the output as ``aux_loss``.
        """
        hidden_states, past_key_values, aux_loss = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **args
        )
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            # Standard next-token shift: predict token t+1 from position t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)

        output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)
        output.aux_loss = aux_loss
        return output


if __name__ == "__main__":
    pass
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os


class Command(object):
    """Tiny shell-command helper.

    Commands listed in ``custom_command`` are dispatched to same-named
    methods on this class; everything else is run through ``os.popen``.
    """

    # Commands handled in-process instead of being passed to the shell.
    custom_command = [
        "cd"
    ]

    @staticmethod
    def _get_cmd(command):
        """Split a command string into (cmd, args); None for an empty string."""
        text = str(command).strip()
        if not text:
            return None
        head, *rest = text.split(sep=" ")
        return head, " ".join(rest)

    @classmethod
    def popen(cls, command):
        """Run a command and return its stdout as a string.

        ``cd`` (and any other custom command) is dispatched to the matching
        classmethod instead of spawning a shell.
        """
        cmd, args = cls._get_cmd(command)
        if cmd in cls.custom_command:
            handler = getattr(cls, cmd)
            return handler(args)
        pipe = os.popen(command)
        output = pipe.read()
        pipe.close()
        return output

    @classmethod
    def cd(cls, args):
        """Change the working directory; relative paths resolve against cwd."""
        target = args if args.startswith("/") else os.path.join(os.getcwd(), args)
        os.chdir(target)

    @classmethod
    def system(cls, command):
        """Run a command via os.system and return its exit status."""
        return os.system(command)

    def __init__(self):
        pass


def ps_ef_grep(keyword: str):
    """Return `ps -ef` rows containing keyword, excluding the grep itself."""
    cmd = "ps -ef | grep {}".format(keyword)
    output = Command.popen(cmd)
    lines = str(output).split("\n")
    return [line for line in lines if keyword in line and "grep" not in line]


if __name__ == "__main__":
    pass
import traverse + + +class EnvironmentManager(object): + def __init__(self, path, env, override=False): + filename = os.path.join(path, '{}.env'.format(env)) + self.filename = filename + + load_dotenv( + dotenv_path=filename, + override=override + ) + + self._environ = dict() + + def open_dotenv(self, filename: str = None): + filename = filename or self.filename + dotenv = DotEnv( + dotenv_path=filename, + stream=None, + verbose=False, + interpolate=False, + override=False, + encoding="utf-8", + ) + result = dotenv.dict() + return result + + def get(self, key, default=None, dtype=str): + result = os.environ.get(key) + if result is None: + if default is None: + result = None + else: + result = default + else: + result = dtype(result) + self._environ[key] = result + return result + + +_DEFAULT_DTYPE_MAP = { + 'int': int, + 'float': float, + 'str': str, + 'json.loads': json.loads +} + + +class JsonConfig(object): + """ + 将 json 中, 形如 `$float:threshold` 的值, 处理为: + 从环境变量中查到 threshold, 再将其转换为 float 类型. + """ + def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None): + self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP + self.environment = environment or os.environ + + def sanitize_by_filename(self, filename: str): + with open(filename, 'r', encoding='utf-8') as f: + js = json.load(f) + + return self.sanitize_by_json(js) + + def sanitize_by_json(self, js): + js = traverse( + js, + callback=self.sanitize, + environment=self.environment + ) + return js + + def sanitize(self, string, environment): + """支持 $ 符开始的, 环境变量配置""" + if isinstance(string, str) and string.startswith('$'): + dtype, key = string[1:].split(':') + dtype = self.dtype_map[dtype] + + value = environment.get(key) + if value is None: + raise AssertionError('environment not exist. 
key: {}'.format(key)) + + value = dtype(value) + result = value + else: + result = string + return result + + +def demo1(): + import json + + from project_settings import project_path + + environment = EnvironmentManager( + path=os.path.join(project_path, 'server/callbot_server/dotenv'), + env='dev', + ) + init_scenes = environment.get(key='init_scenes', dtype=json.loads) + print(init_scenes) + print(environment._environ) + return + + +if __name__ == '__main__': + demo1() diff --git a/toolbox/os/other.py b/toolbox/os/other.py new file mode 100644 index 0000000000000000000000000000000000000000..f215505eedfd714442d2fab241c8b1aff871d18a --- /dev/null +++ b/toolbox/os/other.py @@ -0,0 +1,9 @@ +import os +import inspect + + +def pwd(): + """你在哪个文件调用此函数, 它就会返回那个文件所在的 dir 目标""" + frame = inspect.stack()[1] + module = inspect.getmodule(frame[0]) + return os.path.dirname(os.path.abspath(module.__file__)) diff --git a/toolbox/torch/__init__.py b/toolbox/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/modules/__init__.py b/toolbox/torch/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc5155c67cae42f80e8126d1727b0edc1e02398 --- /dev/null +++ b/toolbox/torch/modules/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/modules/gaussian_mixture.py b/toolbox/torch/modules/gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..d32af30380470ef2dcfeebddae61e83ded225034 --- /dev/null +++ b/toolbox/torch/modules/gaussian_mixture.py @@ -0,0 +1,173 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +https://github.com/georgepar/gmmhmm-pytorch/blob/master/gmm.py 
+https://github.com/ldeecke/gmm-torch +""" +import math + +from sklearn import cluster +import torch +import torch.nn as nn + + +class GaussianMixtureModel(nn.Module): + def __init__(self, + n_mixtures: int, + n_features: int, + init: str = "random", + device: str = 'cpu', + n_iter: int = 1000, + delta: float = 1e-3, + warm_start: bool = False, + ): + super(GaussianMixtureModel, self).__init__() + self.n_mixtures = n_mixtures + self.n_features = n_features + self.init = init + self.device = device + self.n_iter = n_iter + self.delta = delta + self.warm_start = warm_start + + if init not in ('kmeans', 'random'): + raise AssertionError + + self.mu = nn.Parameter( + torch.Tensor(n_mixtures, n_features), + requires_grad=False, + ) + + self.sigma = None + + # the weight of each gaussian + self.pi = nn.Parameter( + torch.Tensor(n_mixtures), + requires_grad=False + ) + + self.converged_ = False + self.eps = 1e-6 + self.delta = delta + self.warm_start = warm_start + self.n_iter = n_iter + + def reset_sigma(self): + raise NotImplementedError + + def estimate_precisions(self): + raise NotImplementedError + + def log_prob(self, x): + raise NotImplementedError + + def weighted_log_prob(self, x): + log_prob = self.log_prob(x) + weighted_log_prob = log_prob + torch.log(self.pi) + return weighted_log_prob + + def log_likelihood(self, x): + weighted_log_prob = self.weighted_log_prob(x) + per_sample_log_likelihood = torch.logsumexp(weighted_log_prob, dim=1) + log_likelihood = torch.sum(per_sample_log_likelihood) + return log_likelihood + + def e_step(self, x): + weighted_log_prob = self.weighted_log_prob(x) + weighted_log_prob = weighted_log_prob.unsqueeze(dim=-1) + log_likelihood = torch.logsumexp(weighted_log_prob, dim=1, keepdim=True) + q = weighted_log_prob - log_likelihood + return q.squeeze() + + def m_step(self, x, q): + x = x.unsqueeze(dim=1) + + return + + def estimate_mu(self, x, pi, responsibilities): + nk = pi * x.size(0) + mu = torch.sum(responsibilities * x, dim=0, 
keepdim=True) / nk + return mu + + def estimate_pi(self, x, responsibilities): + pi = torch.sum(responsibilities, dim=0, keepdim=True) + self.eps + pi = pi / x.size(0) + return pi + + def reset_parameters(self, x=None): + if self.init == 'random' or x is None: + self.mu.normal_() + self.reset_sigma() + self.pi.fill_(1.0 / self.n_mixtures) + elif self.init == 'kmeans': + centroids = cluster.KMeans(n_clusters=self.n_mixtures, n_init=1).fit(x).cluster_centers_ + centroids = torch.tensor(centroids).to(self.device) + self.update_(mu=centroids) + else: + raise NotImplementedError + + +class DiagonalCovarianceGMM(GaussianMixtureModel): + def __init__(self, + n_mixtures: int, + n_features: int, + init: str = "random", + device: str = 'cpu', + n_iter: int = 1000, + delta: float = 1e-3, + warm_start: bool = False, + ): + super(DiagonalCovarianceGMM, self).__init__( + n_mixtures=n_mixtures, + n_features=n_features, + init=init, + device=device, + n_iter=n_iter, + delta=delta, + warm_start=warm_start, + ) + self.sigma = nn.Parameter( + torch.Tensor(n_mixtures, n_features), requires_grad=False + ) + self.reset_parameters() + self.to(self.device) + + def reset_sigma(self): + self.sigma.fill_(1) + + def estimate_precisions(self): + return torch.rsqrt(self.sigma) + + def log_prob(self, x): + precisions = self.estimate_precisions() + + x = x.unsqueeze(1) + mu = self.mu.unsqueeze(0) + precisions = precisions.unsqueeze(0) + + # This is outer product + exp_term = torch.sum( + (mu * mu + x * x - 2 * x * mu) * (precisions ** 2), dim=2, keepdim=True + ) + log_det = torch.sum(torch.log(precisions), dim=2, keepdim=True) + + logp = -0.5 * (self.n_features * torch.log(2 * math.pi) + exp_term) + log_det + + return logp.squeeze() + + def estimate_sigma(self, x, mu, pi, responsibilities): + nk = pi * x.size(0) + x2 = (responsibilities * x * x).sum(0, keepdim=True) / nk + mu2 = mu * mu + xmu = (responsibilities * mu * x).sum(0, keepdim=True) / nk + sigma = x2 - 2 * xmu + mu2 + self.eps + + 
return sigma + + +def demo1(): + return + + +if __name__ == '__main__': + demo1() diff --git a/toolbox/torch/modules/highway.py b/toolbox/torch/modules/highway.py new file mode 100644 index 0000000000000000000000000000000000000000..fc43ce44301112a97c15f96431ca2ee63a270688 --- /dev/null +++ b/toolbox/torch/modules/highway.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import torch.nn as nn + + +class Highway(nn.Module): + """ + https://arxiv.org/abs/1505.00387 + [Submitted on 3 May 2015 (v1), last revised 3 Nov 2015 (this version, v2)] + + discuss of Highway and ResNet + https://www.zhihu.com/question/279426970 + """ + def __init__(self, in_size, out_size): + super(Highway, self).__init__() + self.H = nn.Linear(in_size, out_size) + self.H.bias.data.zero_() + self.T = nn.Linear(in_size, out_size) + self.T.bias.data.fill_(-1) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, inputs): + H = self.relu(self.H(inputs)) + T = self.sigmoid(self.T(inputs)) + return H * T + inputs * (1.0 - T) + + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/modules/loss.py b/toolbox/torch/modules/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ff8f5a6b0fee97d7977b5606328e706eb9471b82 --- /dev/null +++ b/toolbox/torch/modules/loss.py @@ -0,0 +1,738 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import math +from typing import List, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.loss import _Loss +from torch.autograd import Variable + + +class ClassBalancedLoss(_Loss): + """ + https://arxiv.org/abs/1901.05555 + """ + @staticmethod + def demo1(): + batch_loss: torch.FloatTensor = torch.randn(size=(2, 1), dtype=torch.float32) + targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long) + + class_balanced_loss = ClassBalancedLoss( + num_classes=3, + num_samples_each_class=[300, 433, 50], + reduction='mean', + ) + loss = 
class_balanced_loss.forward(batch_loss=batch_loss, targets=targets) + print(loss) + return + + @staticmethod + def demo2(): + inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32) + targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long) + + focal_loss = FocalLoss( + num_classes=3, + # reduction='mean', + # reduction='sum', + reduction='none', + ) + batch_loss = focal_loss.forward(inputs, targets) + print(batch_loss) + + class_balanced_loss = ClassBalancedLoss( + num_classes=3, + num_samples_each_class=[300, 433, 50], + reduction='mean', + ) + loss = class_balanced_loss.forward(batch_loss=batch_loss, targets=targets) + print(loss) + + return + + def __init__(self, + num_classes: int, + num_samples_each_class: List[int], + beta: float = 0.999, + reduction: str = 'mean') -> None: + super(ClassBalancedLoss, self).__init__(None, None, reduction) + + effective_num = 1.0 - np.power(beta, num_samples_each_class) + weights = (1.0 - beta) / np.array(effective_num) + self.weights = weights / np.sum(weights) * num_classes + + def forward(self, batch_loss: torch.FloatTensor, targets: torch.LongTensor): + """ + :param batch_loss: shape=[batch_size, 1] + :param targets: shape=[batch_size,] + :return: + """ + weights = list() + targets = targets.numpy() + for target in targets: + weights.append([self.weights[target]]) + + weights = torch.tensor(weights, dtype=torch.float32) + batch_loss = weights * batch_loss + + if self.reduction == 'mean': + loss = batch_loss.mean() + elif self.reduction == 'sum': + loss = batch_loss.sum() + else: + loss = batch_loss + return loss + + +class EqualizationLoss(_Loss): + """ + 在图像识别中的, sigmoid 的多标签分类, 且 num_classes 类别数之外有一个 background 背景类别. 
+ Equalization Loss + https://arxiv.org/abs/2003.05176 + Equalization Loss v2 + https://arxiv.org/abs/2012.08548 + """ + + @staticmethod + def demo1(): + logits: torch.FloatTensor = torch.randn(size=(3, 3), dtype=torch.float32) + targets: torch.LongTensor = torch.tensor([1, 2, 3], dtype=torch.long) + + equalization_loss = EqualizationLoss( + num_samples_each_class=[300, 433, 50], + threshold=100, + reduction='mean', + ) + loss = equalization_loss.forward(logits=logits, targets=targets) + print(loss) + return + + def __init__(self, + num_samples_each_class: List[int], + threshold: int = 100, + reduction: str = 'mean') -> None: + super(EqualizationLoss, self).__init__(None, None, reduction) + self.num_samples_each_class = np.array(num_samples_each_class, dtype=np.int32) + self.threshold = threshold + + def forward(self, + logits: torch.FloatTensor, + targets: torch.LongTensor + ): + """ + num_classes + 1 对应于背景类别 background. + :param logits: shape=[batch_size, num_classes] + :param targets: shape=[batch_size] + :return: + """ + batch_size, num_classes = logits.size() + + one_hot_targets = F.one_hot(targets, num_classes=num_classes + 1) + one_hot_targets = one_hot_targets[:, :-1] + + exclude = self.exclude_func( + num_classes=num_classes, + targets=targets + ) + is_tail = self.threshold_func( + num_classes=num_classes, + num_samples_each_class=self.num_samples_each_class, + threshold=self.threshold, + ) + + weights = 1 - exclude * is_tail * (1 - one_hot_targets) + + batch_loss = F.binary_cross_entropy_with_logits( + logits, + one_hot_targets.float(), + reduction='none' + ) + + batch_loss = weights * batch_loss + + if self.reduction == 'mean': + loss = batch_loss.mean() + elif self.reduction == 'sum': + loss = batch_loss.sum() + else: + loss = batch_loss + + loss = loss / num_classes + return loss + + @staticmethod + def exclude_func(num_classes: int, targets: torch.LongTensor): + """ + 最后一个类别是背景 background. 
class FocalLoss(_Loss):
    """
    Focal loss: scales cross entropy by (1 - p_t)^gamma so that easy,
    well-classified examples contribute less to the total loss.

    https://arxiv.org/abs/1708.02002
    """
    @staticmethod
    def demo1():
        # BUG FIX: the original declared `def demo1(self)` under @staticmethod,
        # so calling `FocalLoss.demo1()` raised a TypeError (missing `self`).
        inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32)
        targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)

        focal_loss = FocalLoss(
            num_classes=3,
            reduction='mean',
            # reduction='sum',
            # reduction='none',
        )
        loss = focal_loss.forward(inputs, targets)
        print(loss)
        return

    def __init__(self,
                 num_classes: int,
                 alpha: List[float] = None,
                 gamma: int = 2,
                 reduction: str = 'mean',
                 inputs_logits: bool = True) -> None:
        """
        :param num_classes: number of target classes.
        :param alpha: optional per-class weighting factors; defaults to all ones.
        :param gamma: focusing exponent; gamma=0 recovers plain cross entropy.
        :param reduction: (`none`, `mean`, `sum`) available.
        :param inputs_logits: if False, the inputs should be probs.
        """
        super(FocalLoss, self).__init__(None, None, reduction)
        if alpha is None:
            self.alpha = torch.ones(num_classes, 1)
        else:
            self.alpha = torch.tensor(alpha, dtype=torch.float32)
        self.gamma = gamma
        self.num_classes = num_classes
        self.inputs_logits = inputs_logits

    def forward(self,
                inputs: torch.FloatTensor,
                targets: torch.LongTensor):
        """
        :param inputs: logits (or probs if inputs_logits=False), shape=[batch_size, num_classes]
        :param targets: shape=[batch_size,]
        :return: scalar loss, or shape=[batch_size, 1] when reduction='none'
        """
        batch_size, num_classes = inputs.shape

        if self.inputs_logits:
            probs = F.softmax(inputs, dim=-1)
        else:
            probs = inputs

        # one-hot mask selecting each sample's target class
        class_mask = torch.zeros(size=(batch_size, num_classes),
                                 dtype=inputs.dtype, device=inputs.device)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids, 1.)

        # keep the per-class alpha weights on the same device as the inputs
        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.view(-1)]

        # probability assigned to the true class, shape=[batch_size, 1]
        probs = (probs * class_mask).sum(1).view(-1, 1)
        log_p = probs.log()

        batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p

        if self.reduction == 'mean':
            loss = batch_loss.mean()
        elif self.reduction == 'sum':
            loss = batch_loss.sum()
        else:
            loss = batch_loss
        return loss
class HingeLinear(nn.Module):
    """
    Margin-injecting logits layer.

    use this instead of `HingeLoss`, then you can combine it with
    `FocalLoss` or others.

    During training the target class's logit is lowered by a per-class margin
    (scaled so the largest entry of `margin_list` maps to `max_margin`), then
    all logits are multiplied by `scale`; at eval time logits pass through
    unchanged.
    """
    def __init__(self,
                 margin_list: List[float],
                 max_margin: float = 0.5,
                 scale: float = 1.0,
                 weight: Optional[torch.Tensor] = None
                 ) -> None:
        super(HingeLinear, self).__init__()

        self.max_margin = max_margin
        self.scale = scale
        self.weight = weight

        # normalize so the largest margin equals max_margin
        margins = np.array(margin_list)
        margins = margins * (max_margin / np.max(margins))
        self.margin_list = torch.tensor(margins, dtype=torch.float32)

    def forward(self,
                inputs: torch.FloatTensor,
                targets: torch.LongTensor
                ):
        """
        :param inputs: logits, shape=[batch_size, num_classes]
        :param targets: shape=[batch_size,]
        :return: adjusted logits, shape=[batch_size, num_classes]
        """
        # margins only apply while training with known targets
        if not self.training or targets is None:
            return inputs

        _, num_classes = inputs.shape
        target_one_hot = F.one_hot(targets, num_classes=num_classes)

        # per-sample margin picked out of the per-class table, shape=[batch, 1]
        per_class_margin = torch.unsqueeze(self.margin_list, dim=0)
        per_sample_margin = torch.sum(per_class_margin * target_one_hot, dim=-1)
        per_sample_margin = torch.unsqueeze(per_sample_margin, dim=-1)

        # lower only the target-class logit, creating the margin boundary
        shifted = inputs - per_sample_margin
        adjusted = torch.where(target_one_hot > 0, shifted, inputs)
        return adjusted * self.scale
class NegativeEntropy(_Loss):
    """
    Negative entropy of the predicted distribution: sum_i p_i * log(p_i).

    Minimizing it encourages confident (low-entropy) predictions; negate it
    to use as an entropy-regularization term. `targets` is accepted for
    interface compatibility but unused.
    """
    def __init__(self,
                 reduction: str = 'mean',
                 inputs_logits: bool = True) -> None:
        """
        :param reduction: kept for interface parity; the sum is always used.
        :param inputs_logits: if False, the inputs should be probs.
        """
        super(NegativeEntropy, self).__init__(None, None, reduction)
        self.inputs_logits = inputs_logits

    def forward(self,
                inputs: torch.FloatTensor,
                targets: torch.LongTensor):
        if self.inputs_logits:
            probs = F.softmax(inputs, dim=-1)
            # BUG FIX: the original applied log_softmax to the *probabilities*
            # (a second softmax), which is not log(p). log_softmax of the raw
            # logits equals log(softmax(logits)) and is numerically stable.
            log_probs = F.log_softmax(inputs, dim=-1)
        else:
            probs = inputs
            log_probs = torch.log(probs)

        weighted_negative_likelihood = - log_probs * probs

        # -(-p * log p) summed == sum p*log(p), the negative entropy
        loss = - weighted_negative_likelihood.sum()
        return loss
+ 另外, 它还将每个 logits 乘以 s, 这可以控制各 logits 之间的相对大小. + 根 HingeLoss 有点像. + """ + def __init__(self, + reduction: str = 'mean') -> None: + super(AdditiveMarginSoftMax, self).__init__(None, None, reduction) + + +class AdditiveAngularMarginSoftMax(_Loss): + """ + Alias: ArcFace, AAM-Softmax + + ArcFace: Additive Angular Margin Loss for Deep Face Recognition + https://arxiv.org/abs/1801.07698 + + 参考代码: + https://github.com/huangkeju/AAMSoftmax-OpenMax/blob/main/AAMSoftmax%2BOvA/metrics.py + + """ + @staticmethod + def demo1(): + """ + 角度与数值转换 + pi / 180 代表 1 度, + pi / 180 = 0.01745 + """ + + # 度数转数值 + degree = 10 + result = degree * math.pi / 180 + print(result) + + # 数值转数度 + radian = 0.2 + result = radian / (math.pi / 180) + print(result) + + return + + def __init__(self, + hidden_size: int, + num_labels: int, + margin: float = 0.2, + scale: float = 10.0, + ): + """ + :param hidden_size: + :param num_labels: + :param margin: 建议取值角度为 [10, 30], 对应的数值为 [0.1745, 0.5236] + :param scale: + """ + super(AdditiveAngularMarginSoftMax, self).__init__() + self.margin = margin + self.scale = scale + self.weight = torch.nn.Parameter(torch.FloatTensor(num_labels, hidden_size), requires_grad=True) + nn.init.xavier_uniform_(self.weight) + + self.cos_margin = math.cos(self.margin) + self.sin_margin = math.sin(self.margin) + + # sin(a-b) = sin(a)cos(b) - cos(a)sin(b) + # sin(pi - a) = sin(a) + + self.loss = nn.CrossEntropyLoss() + + def forward(self, + inputs: torch.Tensor, + label: torch.LongTensor = None + ): + """ + :param inputs: shape=[batch_size, ..., hidden_size] + :param label: + :return: logits + """ + x = F.normalize(inputs) + weight = F.normalize(self.weight) + cosine = F.linear(x, weight) + + if self.training: + + # sin^2 + cos^2 = 1 + sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) + + # cos(a+b) = cos(a)cos(b) - sin(a)sin(b) + cosine_theta_margin = cosine * self.cos_margin - sine * self.sin_margin + + # when the `cosine > - self.cos_margin` there is enough space 
    def forward(self,
                inputs: torch.Tensor,
                targets: torch.LongTensor = None
                ):
        """
        ArcFace head: cosine similarities between L2-normalized embeddings and
        L2-normalized class weights; while training with known targets, the
        target class's angle is enlarged by `self.margin` before scaling.

        :param inputs: shape=[batch_size, ..., hidden_size]
        :param targets: shape=[batch_size,]; ignored outside training.
        :return: logits — scaled margin-adjusted cosines in training,
                 raw cosines otherwise.
        """
        x = F.normalize(inputs)
        weight = F.normalize(self.weight)
        # cosine of the angle between each embedding and each class center
        cosine = F.linear(x, weight)

        if self.training and targets is not None:
            # sin^2 + cos^2 = 1; clamp guards against tiny negatives from
            # floating-point error before the sqrt
            sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))

            # cos(theta + margin) via cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
            cosine_theta_margin = cosine * self.cos_margin - sine * self.sin_margin

            # when the `cosine > - self.cos_margin` there is enough space to
            # add margin on theta; otherwise fall back to a linear penalty so
            # the adjusted value stays monotonic in theta
            cosine_theta_margin = torch.where(cosine > - self.cos_margin, cosine_theta_margin, cosine - (self.margin * self.sin_margin))

            # one-hot mask of each sample's target class
            one_hot = torch.zeros_like(cosine)
            one_hot.scatter_(1, targets.view(-1, 1), 1)

            # apply the margin only to the target-class cosine, then scale
            logits = torch.where(one_hot == 1, cosine_theta_margin, cosine)
            logits = logits * self.scale
        else:
            logits = cosine
        return logits
b/toolbox/torch/training/metrics/categorical_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..63010cc41c06d859950f08fdaed4f6d5eaddc0c8 --- /dev/null +++ b/toolbox/torch/training/metrics/categorical_accuracy.py @@ -0,0 +1,82 @@ +from typing import Optional + +from overrides import overrides +import torch + + +class CategoricalAccuracy(object): + def __init__(self, top_k: int = 1, tie_break: bool = False) -> None: + if top_k > 1 and tie_break: + raise AssertionError("Tie break in Categorical Accuracy " + "can be done only for maximum (top_k = 1)") + if top_k <= 0: + raise AssertionError("top_k passed to Categorical Accuracy must be > 0") + self._top_k = top_k + self._tie_break = tie_break + self.correct_count = 0. + self.total_count = 0. + + def __call__(self, + predictions: torch.Tensor, + gold_labels: torch.Tensor, + mask: Optional[torch.Tensor] = None): + + # predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + + # Some sanity checks. + num_classes = predictions.size(-1) + if gold_labels.dim() != predictions.dim() - 1: + raise AssertionError("gold_labels must have dimension == predictions.size() - 1 but " + "found tensor of shape: {}".format(predictions.size())) + if (gold_labels >= num_classes).any(): + raise AssertionError("A gold label passed to Categorical Accuracy contains an id >= {}, " + "the number of classes.".format(num_classes)) + + predictions = predictions.view((-1, num_classes)) + gold_labels = gold_labels.view(-1).long() + if not self._tie_break: + # Top K indexes of the predictions (or fewer, if there aren't K of them). + # Special case topk == 1, because it's common and .max() is much faster than .topk(). + if self._top_k == 1: + top_k = predictions.max(-1)[1].unsqueeze(-1) + else: + top_k = predictions.topk(min(self._top_k, predictions.shape[-1]), -1)[1] + + # This is of shape (batch_size, ..., top_k). 
+ correct = top_k.eq(gold_labels.unsqueeze(-1)).float() + else: + # prediction is correct if gold label falls on any of the max scores. distribute score by tie_counts + max_predictions = predictions.max(-1)[0] + max_predictions_mask = predictions.eq(max_predictions.unsqueeze(-1)) + # max_predictions_mask is (rows X num_classes) and gold_labels is (batch_size) + # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions + # For each row check if index pointed by gold_label is was 1 or not (among max scored classes) + correct = max_predictions_mask[torch.arange(gold_labels.numel()).long(), gold_labels].float() + tie_counts = max_predictions_mask.sum(-1) + correct /= tie_counts.float() + correct.unsqueeze_(-1) + + if mask is not None: + correct *= mask.view(-1, 1).float() + self.total_count += mask.sum() + else: + self.total_count += gold_labels.numel() + self.correct_count += correct.sum() + + def get_metric(self, reset: bool = False): + """ + Returns + ------- + The accumulated accuracy. 
+ """ + if self.total_count > 1e-12: + accuracy = float(self.correct_count) / float(self.total_count) + else: + accuracy = 0.0 + if reset: + self.reset() + return {'accuracy': accuracy} + + def reset(self): + self.correct_count = 0.0 + self.total_count = 0.0 diff --git a/toolbox/torch/training/metrics/verbose_categorical_accuracy.py b/toolbox/torch/training/metrics/verbose_categorical_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..45f7a355e0b0e617bd0ed1573b61695d7ace9655 --- /dev/null +++ b/toolbox/torch/training/metrics/verbose_categorical_accuracy.py @@ -0,0 +1,128 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +from typing import Dict, List, Optional + +import numpy as np +import torch + + +class CategoricalAccuracyVerbose(object): + def __init__(self, + index_to_token: Dict[int, str], + label_namespace: str = "labels", + top_k: int = 1, + ) -> None: + if top_k <= 0: + raise AssertionError("top_k passed to Categorical Accuracy must be > 0") + self._index_to_token = index_to_token + self._label_namespace = label_namespace + self._top_k = top_k + self.correct_count = 0. + self.total_count = 0. + self.label_correct_count = dict() + self.label_total_count = dict() + + def __call__(self, + predictions: torch.Tensor, + gold_labels: torch.Tensor, + mask: Optional[torch.Tensor] = None): + num_classes = predictions.size(-1) + if gold_labels.dim() != predictions.dim() - 1: + raise AssertionError("gold_labels must have dimension == predictions.size() - 1 but " + "found tensor of shape: {}".format(predictions.size())) + if (gold_labels >= num_classes).any(): + raise AssertionError("A gold label passed to Categorical Accuracy contains an id >= {}, " + "the number of classes.".format(num_classes)) + + predictions = predictions.view((-1, num_classes)) + gold_labels = gold_labels.view(-1).long() + + # Top K indexes of the predictions (or fewer, if there aren't K of them). 
+ # Special case topk == 1, because it's common and .max() is much faster than .topk(). + if self._top_k == 1: + top_k = predictions.max(-1)[1].unsqueeze(-1) + else: + top_k = predictions.topk(min(self._top_k, predictions.shape[-1]), -1)[1] + + # This is of shape (batch_size, ..., top_k). + correct = top_k.eq(gold_labels.unsqueeze(-1)).float() + + if mask is not None: + correct *= mask.view(-1, 1).float() + self.total_count += mask.sum() + else: + self.total_count += gold_labels.numel() + self.correct_count += correct.sum() + + labels: List[int] = np.unique(gold_labels.cpu().numpy()).tolist() + for label in labels: + label_mask = (gold_labels == label) + + label_correct = correct * label_mask.view(-1, 1).float() + label_correct = int(label_correct.sum()) + label_count = int(label_mask.sum()) + + label_str = self._index_to_token[label] + if label_str in self.label_correct_count: + self.label_correct_count[label_str] += label_correct + else: + self.label_correct_count[label_str] = label_correct + + if label_str in self.label_total_count: + self.label_total_count[label_str] += label_count + else: + self.label_total_count[label_str] = label_count + + def get_metric(self, reset: bool = False): + """ + Returns + ------- + The accumulated accuracy. 
+ """ + result = dict() + if self.total_count > 1e-12: + accuracy = float(self.correct_count) / float(self.total_count) + else: + accuracy = 0.0 + result['accuracy'] = accuracy + + for label in self.label_total_count.keys(): + total = self.label_total_count[label] + correct = self.label_correct_count.get(label, 0.0) + label_accuracy = correct / total + result[label] = label_accuracy + + if reset: + self.reset() + return result + + def reset(self): + self.correct_count = 0.0 + self.total_count = 0.0 + self.label_correct_count = dict() + self.label_total_count = dict() + + +def demo1(): + + categorical_accuracy_verbose = CategoricalAccuracyVerbose( + index_to_token={0: '0', 1: '1'}, + top_k=2, + ) + + predictions = torch.randn(size=(2, 3), dtype=torch.float32) + gold_labels = torch.ones(size=(2,), dtype=torch.long) + # print(predictions) + # print(gold_labels) + + categorical_accuracy_verbose( + predictions=predictions, + gold_labels=gold_labels, + ) + metric = categorical_accuracy_verbose.get_metric() + print(metric) + return + + +if __name__ == '__main__': + demo1() diff --git a/toolbox/torch/training/trainer/__init__.py b/toolbox/torch/training/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/training/trainer/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/training/trainer/trainer.py b/toolbox/torch/training/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/training/trainer/trainer.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/utils/__init__.py b/toolbox/torch/utils/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/utils/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/utils/data/__init__.py b/toolbox/torch/utils/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/utils/data/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/utils/data/dataset/__init__.py b/toolbox/torch/utils/data/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aad738e112896111c38ae6624c8632aee62a234 --- /dev/null +++ b/toolbox/torch/utils/data/dataset/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +if __name__ == '__main__': + pass diff --git a/toolbox/torch/utils/data/dataset/wave_classifier_excel_dataset.py b/toolbox/torch/utils/data/dataset/wave_classifier_excel_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..fbeadda27af80391a03be06ace680915fb16261b --- /dev/null +++ b/toolbox/torch/utils/data/dataset/wave_classifier_excel_dataset.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import os + +import librosa +import numpy as np +import pandas as pd +from scipy.io import wavfile +import torch +import torchaudio +from torch.utils.data import Dataset +from tqdm import tqdm + +from toolbox.torch.utils.data.vocabulary import Vocabulary + + +class WaveClassifierExcelDataset(Dataset): + def __init__(self, + vocab: Vocabulary, + excel_file: str, + expected_sample_rate: int, + resample: bool = False, + root_path: str = None, + category: str = None, + category_field: str = "category", + label_field: str = "labels", + max_wave_value: float = 1.0, + ) -> None: + self.vocab = vocab + self.excel_file = 
excel_file + + self.expected_sample_rate = expected_sample_rate + self.resample = resample + self.root_path = root_path + self.category = category + self.category_field = category_field + self.label_field = label_field + self.max_wave_value = max_wave_value + + df = pd.read_excel(excel_file) + + samples = list() + for i, row in tqdm(df.iterrows(), total=len(df)): + filename = row["filename"] + label = row[self.label_field] + + if self.category is not None and self.category != row[self.category_field]: + continue + + samples.append({ + "filename": filename, + "label": label, + }) + self.samples = samples + + def __getitem__(self, index): + sample = self.samples[index] + filename = sample["filename"] + label = sample["label"] + + if self.root_path is not None: + filename = os.path.join(self.root_path, filename) + + waveform = self.filename_to_waveform(filename) + + namespace = self.label_field if self.category is None else self.category + token_to_index = self.vocab.get_token_to_index_vocabulary(namespace=namespace) + label: int = token_to_index[label] + + result = { + "waveform": waveform, + "label": torch.tensor(label, dtype=torch.int64), + } + return result + + def __len__(self): + return len(self.samples) + + def filename_to_waveform(self, filename: str): + try: + if self.resample: + waveform, sample_rate = librosa.load(filename, sr=self.expected_sample_rate) + # waveform, sample_rate = torchaudio.load(filename, normalize=True) + else: + sample_rate, waveform = wavfile.read(filename) + waveform = waveform / self.max_wave_value + except ValueError as e: + print(filename) + raise e + if sample_rate != self.expected_sample_rate: + raise AssertionError + + waveform = torch.tensor(waveform, dtype=torch.float32) + return waveform + + +if __name__ == "__main__": + pass diff --git a/toolbox/torch/utils/data/vocabulary.py b/toolbox/torch/utils/data/vocabulary.py new file mode 100644 index 0000000000000000000000000000000000000000..2637d3bb7b4ecd35c27594fbcdf0468b91137bba 
def namespace_match(pattern: str, namespace: str):
    """
    Matches a namespace pattern against a namespace string. For example, ``*tags`` matches
    ``passage_tags`` and ``question_tags`` and ``tokens`` matches ``tokens`` but not
    ``stemmed_tokens``.
    """
    # a leading '*' makes the rest of the pattern a suffix match
    is_wildcard_hit = pattern[0] == '*' and namespace.endswith(pattern[1:])
    return is_wildcard_hit or pattern == namespace
class Vocabulary(object):
    """
    Bidirectional token<->index mapping partitioned into named "namespaces"
    (e.g. ``tokens``, ``labels``).

    Namespaces whose names match one of ``non_padded_namespaces`` (exact name
    or ``*``-suffix pattern) start empty; every other namespace is created
    with index 0 reserved for the padding token and index 1 for the OOV
    token.  The vocabulary round-trips to a directory: one ``<namespace>.txt``
    file per namespace with one token per line in index order, plus
    ``non_padded_namespaces.txt`` listing the non-padded patterns.
    """

    def __init__(self, non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES):
        self._non_padded_namespaces = set(non_padded_namespaces)
        self._padding_token = DEFAULT_PADDING_TOKEN
        self._oov_token = DEFAULT_OOV_TOKEN
        # Both dicts create a namespace lazily on first access; padded
        # namespaces are seeded with the padding and OOV entries.
        self._token_to_index = _TokenToIndexDefaultDict(self._non_padded_namespaces,
                                                        self._padding_token,
                                                        self._oov_token)
        self._index_to_token = _IndexToTokenDefaultDict(self._non_padded_namespaces,
                                                        self._padding_token,
                                                        self._oov_token)

    def _is_padded_namespace(self, namespace: str) -> bool:
        """True when `namespace` gets the reserved padding/OOV entries."""
        return not any(namespace_match(pattern, namespace)
                       for pattern in self._non_padded_namespaces)

    def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
        """Add `token` to `namespace` if absent; return its index either way."""
        if token not in self._token_to_index[namespace]:
            # Tokens are appended, so the new index equals the current size.
            index = len(self._token_to_index[namespace])
            self._token_to_index[namespace][token] = index
            self._index_to_token[namespace][index] = token
            return index
        else:
            return self._token_to_index[namespace][token]

    def get_index_to_token_vocabulary(self, namespace: str = 'tokens') -> Dict[int, str]:
        """Return the index->token dict for `namespace` (created if missing)."""
        return self._index_to_token[namespace]

    def get_token_to_index_vocabulary(self, namespace: str = 'tokens') -> Dict[str, int]:
        """Return the token->index dict for `namespace` (created if missing)."""
        return self._token_to_index[namespace]

    def get_token_index(self, token: str, namespace: str = 'tokens') -> int:
        """
        Return the index of `token`, falling back to the OOV index.

        NOTE(review): a non-padded namespace has no OOV entry, so an unknown
        token there raises KeyError instead of falling back.
        """
        if token in self._token_to_index[namespace]:
            return self._token_to_index[namespace][token]
        else:
            return self._token_to_index[namespace][self._oov_token]

    def get_token_from_index(self, index: int, namespace: str = 'tokens'):
        """Return the token stored at `index`; raises KeyError if unknown."""
        return self._index_to_token[namespace][index]

    def get_vocab_size(self, namespace: str = 'tokens') -> int:
        """Number of entries in `namespace`, including padding/OOV if padded."""
        return len(self._token_to_index[namespace])

    def save_to_files(self, directory: str):
        """
        Persist the vocabulary to `directory` so `from_files` can rebuild it.

        Bug fix: the padding token of a padded namespace is no longer written
        to the token file.  `set_from_file` re-seeds the padding token at
        index 0 and starts reading file tokens at index 1, so writing the
        padding token shifted every index by one (and remapped the padding
        token itself) after a save/load round trip.
        """
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'w', encoding='utf-8') as f:
            for namespace_str in self._non_padded_namespaces:
                f.write('{}\n'.format(namespace_str))

        for namespace, token_to_index in self._token_to_index.items():
            is_padded = self._is_padded_namespace(namespace)
            filename = os.path.join(directory, '{}.txt'.format(namespace))
            with open(filename, 'w', encoding='utf-8') as f:
                # dicts preserve insertion order, which matches index order.
                for token, index in token_to_index.items():
                    if is_padded and index == 0:
                        # Skip the padding token; see docstring.
                        continue
                    f.write('{}\n'.format(token))

    @classmethod
    def from_files(cls, directory: str) -> 'Vocabulary':
        """Rebuild a Vocabulary previously written by `save_to_files`."""
        with open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'r', encoding='utf-8') as f:
            non_padded_namespaces = [namespace_str.strip() for namespace_str in f]

        vocab = cls(non_padded_namespaces=non_padded_namespaces)

        for namespace_filename in os.listdir(directory):
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            if namespace_filename.startswith("."):
                continue
            # e.g. 'tokens.txt' -> 'tokens'
            namespace = namespace_filename.replace('.txt', '')
            if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
                is_padded = False
            else:
                is_padded = True
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace)

        return vocab

    def set_from_file(self,
                      filename: str,
                      is_padded: bool = True,
                      oov_token: str = DEFAULT_OOV_TOKEN,
                      namespace: str = "tokens"
                      ):
        """
        Replace `namespace` with the tokens in `filename` (one per line).

        :param filename: token file written by `save_to_files` or compatible.
        :param is_padded: if True, reserve index 0 for the padding token and
            start assigning file tokens from index 1.
        :param oov_token: spelling of the OOV token inside the file; it is
            re-mapped to this vocabulary's own OOV token.
        :param namespace: the namespace to (re)populate.
        """
        if is_padded:
            self._token_to_index[namespace] = {self._padding_token: 0}
            self._index_to_token[namespace] = {0: self._padding_token}
        else:
            self._token_to_index[namespace] = {}
            self._index_to_token[namespace] = {}

        with open(filename, 'r', encoding='utf-8') as f:
            index = 1 if is_padded else 0
            for row in f:
                token = str(row).strip()
                if token == oov_token:
                    token = self._oov_token
                self._token_to_index[namespace][token] = index
                self._index_to_token[namespace][index] = token
                index += 1

    def convert_tokens_to_ids(self, tokens: List[str], namespace: str = "tokens"):
        """Map each token to its id, using the OOV id for unknown tokens."""
        result = list()
        for token in tokens:
            idx = self._token_to_index[namespace].get(token)
            if idx is None:
                idx = self._token_to_index[namespace][self._oov_token]
            result.append(idx)
        return result

    def convert_ids_to_tokens(self, ids: List[int], namespace: str = "tokens"):
        """Map each id back to its token; raises KeyError for unknown ids."""
        result = list()
        for idx in ids:
            idx = self._index_to_token[namespace][idx]
            result.append(idx)
        return result

    def pad_or_truncate_ids_by_max_length(self, ids: List[int], max_length: int, namespace: str = "tokens"):
        """Truncate `ids` to `max_length` or right-pad with the padding id."""
        pad_idx = self._token_to_index[namespace][self._padding_token]

        length = len(ids)
        if length > max_length:
            result = ids[:max_length]
        else:
            result = ids + [pad_idx] * (max_length - length)
        return result
class PretrainedConfig(object):
    """
    Base class for model configurations backed by a ``config.yaml`` file.

    Subclasses store their hyper-parameters as instance attributes; this base
    class provides yaml (de)serialization and a ``from_pretrained`` entry
    point that accepts either a directory containing ``config.yaml`` or a
    direct path to a yaml file.
    """

    def __init__(self, **kwargs):
        # The base class keeps no state; subclasses assign their own attributes.
        pass

    @classmethod
    def _dict_from_yaml_file(cls, yaml_file: Union[str, os.PathLike]):
        """Parse `yaml_file` and return its contents as a dict."""
        with open(yaml_file, encoding="utf-8") as f:
            config_dict = yaml.safe_load(f)
        return config_dict

    @classmethod
    def get_config_dict(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike]
    ) -> Dict[str, Any]:
        """
        Resolve `pretrained_model_name_or_path` (a directory containing
        ``config.yaml``, or a direct file path) and load it as a dict.
        """
        if os.path.isdir(pretrained_model_name_or_path):
            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_FILE)
        else:
            config_file = pretrained_model_name_or_path
        config_dict = cls._dict_from_yaml_file(config_file)
        return config_dict

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
        """
        Build a config from `config_dict`, letting matching `kwargs` override
        individual entries.  Keys in `kwargs` that are absent from
        `config_dict` are ignored.  The caller's dict is left untouched.
        """
        # Bug fix: copy before overriding.  The previous implementation wrote
        # the overrides back into the caller's dict, silently mutating shared
        # configuration state.
        config_dict = dict(config_dict)
        for k, v in kwargs.items():
            if k in config_dict.keys():
                config_dict[k] = v
        config = cls(**config_dict)
        return config

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ):
        """Load the yaml config and build an instance, applying `kwargs` overrides."""
        config_dict = cls.get_config_dict(pretrained_model_name_or_path)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self):
        """Return a deep copy of this config's attributes as a plain dict."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_yaml_file(self, yaml_file_path: Union[str, os.PathLike]):
        """Serialize this config to `yaml_file_path` as yaml."""
        config_dict = self.to_dict()

        with open(yaml_file_path, "w", encoding="utf-8") as writer:
            yaml.safe_dump(config_dict, writer)
class CnnAudioClassifierConfig(PretrainedConfig):
    """
    Configuration for the CNN audio classifier.

    :param mel_spectrogram_param: kwargs for the mel-spectrogram front-end.
    :param cls_head_param: kwargs for the classification head.
    :param conv1d_block_param_list: optional list of kwargs, one per Conv1d block.
    :param conv2d_block_param_list: optional list of kwargs, one per Conv2d block.
    """

    def __init__(self,
                 mel_spectrogram_param: dict,
                 cls_head_param: dict,
                 conv1d_block_param_list: List[dict] = None,
                 conv2d_block_param_list: List[dict] = None,
                 **kwargs
                 ):
        super().__init__(**kwargs)
        self.mel_spectrogram_param = mel_spectrogram_param
        self.cls_head_param = cls_head_param
        self.conv1d_block_param_list = conv1d_block_param_list
        self.conv2d_block_param_list = conv2d_block_param_list
class Conv1dBlock(nn.Module):
    """
    Conv1d applied over the feature dimension of a [batch, time, feature]
    tensor: optional BatchNorm1d before the convolution, optional activation
    and dropout after it.  The input is transposed to channels-first for the
    torch layers and transposed back, so callers always see
    [batch, time, feature] in and out (time and feature sizes change with the
    convolution parameters).
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int,
                 padding: int = 0,
                 dilation: int = 1,
                 batch_norm: bool = False,
                 activation: str = None,
                 dropout: float = None,
                 ):
        """
        :param in_channels: size of the input feature dimension.
        :param out_channels: size of the output feature dimension.
        :param kernel_size: convolution kernel width (over time).
        :param stride: convolution stride (over time).  (Fixed annotation:
            this is an int for a 1-d convolution, not a 2-tuple.)
        :param padding: zero-padding added to both sides of the time axis.
            (Fixed annotation: previously ``str`` with an int default.)
        :param dilation: spacing between kernel taps.
        :param batch_norm: if True, apply BatchNorm1d to the input features.
        :param activation: key into ``name2activation`` (e.g. "relu"), or None.
        :param dropout: dropout probability applied after the activation, or None.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size

        # Normalization runs on the raw input (channels-first), before the conv.
        self.batch_norm = nn.BatchNorm1d(in_channels) if batch_norm else None

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=(kernel_size,),
            stride=stride,
            padding=padding,
            dilation=(dilation,),
        )

        self.activation = name2activation[activation]() if activation is not None else None
        self.dropout = nn.Dropout(p=dropout) if dropout is not None else None

    def forward(self, x):
        # x: [batch_size, seq_length, spec_dim] -> channels-first for torch layers.
        x = torch.transpose(x, dim0=-1, dim1=-2)

        if self.batch_norm is not None:
            x = self.batch_norm(x)

        x = self.conv(x)

        if self.activation is not None:
            x = self.activation(x)

        if self.dropout is not None:
            x = self.dropout(x)

        # Back to [batch_size, new_seq_length, out_channels].
        x = torch.transpose(x, dim0=-1, dim1=-2)
        return x
class SpectrogramEncoder(nn.Module):
    """
    Encodes a [batch, time, feature] spectrogram with a stack of Conv1d
    blocks, a stack of Conv2d blocks, or both (Conv1d first).  After the
    Conv2d stack the channel axis is folded back into the feature axis, so
    the output is always [batch, time', feature'].
    """

    def __init__(self,
                 conv1d_block_param_list: List[dict] = None,
                 conv2d_block_param_list: List[dict] = None,
                 ):
        super(SpectrogramEncoder, self).__init__()
        if conv1d_block_param_list is None and conv2d_block_param_list is None:
            raise AssertionError(
                "At least one of the `conv1d_block_param_list` and `conv2d_block_param_list` is required."
            )

        if conv1d_block_param_list is None:
            self.conv1d_block_list = None
        else:
            self.conv1d_block_list = nn.ModuleList(modules=[
                Conv1dBlock(**block_params)
                for block_params in conv1d_block_param_list
            ])

        if conv2d_block_param_list is None:
            self.conv2d_block_list = None
        else:
            self.conv2d_block_list = nn.ModuleList(modules=[
                Conv2dBlock(**block_params)
                for block_params in conv2d_block_param_list
            ])

    def forward(self,
                inputs: torch.Tensor,
                ):
        features = inputs

        if self.conv1d_block_list is not None:
            for conv1d_block in self.conv1d_block_list:
                features = conv1d_block(features)

        if self.conv2d_block_list is not None:
            # Conv2d needs an explicit channel axis: [b, t, f] -> [b, 1, t, f].
            features = torch.unsqueeze(features, dim=1)
            for conv2d_block in self.conv2d_block_list:
                features = conv2d_block(features)

            # Fold channels back into the feature dimension:
            # [b, c, t, f] -> [b, t, c, f] -> [b, t, c * f]
            features = torch.transpose(features, dim0=1, dim1=2)
            batch_size, time_steps, channels, spec_dim = features.shape
            features = torch.reshape(features, shape=(batch_size, time_steps, -1))

        return features
class WaveClassifier(nn.Module):
    """
    Waveform-to-logits classifier: `wave_encoder` turns the raw waveform into
    per-frame features, and `cls_head` pools and projects those features into
    class logits.
    """

    def __init__(self,
                 wave_encoder: WaveEncoder,
                 cls_head: ClsHead,
                 ):
        super(WaveClassifier, self).__init__()
        self.wave_encoder = wave_encoder
        self.cls_head = cls_head

    def forward(self, inputs: torch.Tensor):
        # inputs: raw waveform batch fed straight to the encoder.
        features = self.wave_encoder.forward(inputs)
        # features: [batch_size, seq_length, feature_dim]
        logits = self.cls_head.forward(features)
        # logits: [batch_size, num_labels]
        return logits
class WaveClassifierConfig(PretrainedConfig):
    """
    Configuration for the LSTM audio classifier: parameters for the
    mel-spectrogram front-end, the LSTM encoder, the pooling layer, and the
    classification head.
    """

    def __init__(self,
                 mel_spectrogram_param: dict,
                 lstm_layer_param: dict,
                 pooling_layer_param: dict,
                 cls_head_param: dict,
                 **kwargs
                 ):
        super().__init__(**kwargs)
        self.mel_spectrogram_param = mel_spectrogram_param
        self.lstm_layer_param = lstm_layer_param
        self.pooling_layer_param = pooling_layer_param
        self.cls_head_param = cls_head_param
b/toolbox/torchaudio/models/lstm_audio_classifier/examples/lstm_classifier.yaml @@ -0,0 +1,29 @@ +model_name: "lstm_audio_classifier" + +mel_spectrogram_param: + sample_rate: 8000 + n_fft: 512 + win_length: 200 + hop_length: 80 + f_min: 10 + f_max: 3800 + window_fn: hamming + n_mels: 80 + +lstm_layer_param: + input_size: 80 + hidden_size: 64 + num_layers: 3 + dropout: 0.2 + +pooling_layer_param: + pool_layer: last + +cls_head_param: + input_dim: 64 + num_layers: 1 + hidden_dims: + - 32 + activations: relu + dropout: 0.1 + num_labels: 4 diff --git a/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py b/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..baa04fc9a799f597abd00a84466618b989568cf6 --- /dev/null +++ b/toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py @@ -0,0 +1,364 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import os +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +import torchaudio +import torch.nn as nn +from toolbox.torchaudio.configuration_utils import CONFIG_FILE, PretrainedConfig +from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import WaveClassifierConfig +from toolbox.torchaudio.modules.conv_stft import ConvSTFT +from toolbox.torchaudio.modules.freq_bands.mel_bands import MelBands + + +MODEL_FILE = "model.pt" + + +name2activation = { + "relu": nn.ReLU, +} + + +class FeedForward(nn.Module): + def __init__(self, + input_dim: int, + num_layers: int, + hidden_dims: Union[int, List[int]], + activations: Union[str, List[str]], + dropout: Union[float, List[float]] = 0.0) -> None: + + super(FeedForward, self).__init__() + if not isinstance(hidden_dims, list): + hidden_dims = [hidden_dims] * num_layers # type: ignore + if not isinstance(activations, list): + activations = [activations] * num_layers # type: ignore + if 
class LSTMLayer(nn.Module):
    """
    Thin wrapper around a batch-first nn.LSTM that manages its own zero
    initial state and tolerates a redundant singleton channel axis on the
    input ([b, 1, t, f] is squeezed to [b, t, f]).
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int,
                 dropout: float = 0.0,
                 ):
        super(LSTMLayer, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # nn.LSTM only applies dropout between layers; zero it for a single
        # layer (avoids the torch warning and matches effective behavior).
        self.dropout = 0.0 if num_layers <= 1 else dropout

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            dropout=self.dropout,
            batch_first=True,
        )

    def forward(self, inputs: torch.Tensor, h: Optional[torch.Tensor] = None, c: Optional[torch.Tensor] = None):
        """
        :param inputs: shape [b, t, f] (or [b, 1, t, f], squeezed internally).
        :param h: optional initial hidden state, [num_layers, b, hidden_size].
        :param c: optional initial cell state, [num_layers, b, hidden_size].
        :return: (features [b, t, hidden_size],
                  h [num_layers, b, hidden_size],
                  c [num_layers, b, hidden_size])
        """
        if h is None or c is None:
            # Fresh zero state when the caller supplies no recurrent state.
            h, c = self._init_hidden(inputs.size(0), inputs.device)

        if inputs.dim() == 4:
            # Drop a singleton channel axis: [b, 1, t, f] -> [b, t, f].
            inputs = inputs.squeeze(1)

        features, (h, c) = self.lstm(inputs, (h, c))
        return features, h, c

    def _init_hidden(self, batch_size: int, device: torch.device):
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(*state_shape).to(device)
        c0 = torch.zeros(*state_shape).to(device)
        return h0, c0
class ClsHead(nn.Module):
    """
    Classification head for a single pooled feature vector: a FeedForward
    stack followed by a linear projection to `num_labels` logits.
    """

    def __init__(self,
                 input_dim: int,
                 num_layers: int,
                 hidden_dims: Union[int, List[int]],
                 activations: Union[str, List[str]],
                 num_labels: int,
                 dropout: Union[float, List[float]] = 0.0
                 ):
        super(ClsHead, self).__init__()

        self.feedforward = FeedForward(
            input_dim=input_dim,
            num_layers=num_layers,
            hidden_dims=hidden_dims,
            activations=activations,
            dropout=dropout,
        )
        self.output_project_layer = nn.Linear(self.feedforward.get_output_dim(), num_labels)

    def forward(self, inputs: torch.Tensor):
        # The feedforward stack expects a time axis: [b, f] -> [b, 1, f].
        hidden = torch.unsqueeze(inputs, dim=1)
        hidden = self.feedforward(hidden)
        # Drop the singleton time axis again: [b, 1, f] -> [b, f].
        hidden = torch.squeeze(hidden, dim=1)

        logits = self.output_project_layer.forward(hidden)
        # logits: [b, num_labels]
        return logits
class WaveClassifierExport(WaveClassifierPretrainedModel):
    """
    Export/streaming-friendly variant of the LSTM classifier.

    Unlike `WaveClassifier.forward`, this forward takes the LSTM recurrent
    state (h, c) as explicit inputs and returns the updated state alongside
    the logits, so the model can be traced/exported and run incrementally on
    audio chunks.  The encoder internals (STFT -> mel -> log -> LSTM) are
    invoked step by step here because `WaveEncoder.forward` hides the state.
    """

    def __init__(self, config: WaveClassifierConfig):
        super(WaveClassifierExport, self).__init__(config=config)

    def forward(self,
                inputs: torch.Tensor,
                h: torch.Tensor = None,
                c: torch.Tensor = None,
                ):
        # inputs: [b, num_samples] raw waveform chunk.
        # h, c: optional recurrent state, [num_layers, b, hidden_size];
        # when None the LSTM layer initializes a fresh zero state.
        x = inputs

        with torch.no_grad():
            x = self.wave_encoder.stft.forward(x)
            # shape = [b, freq_bins, t]
            x = x.transpose(1, 2)
            # shape = [b, t, freq_bins]
            x = self.wave_encoder.mel_scale.mel_scale(x)
            # shape = [b, t, mel_bins]
            # Small epsilon keeps log() finite on silent frames.
            spec = x + 1e-6
            spec = spec.log()
        # spec shape = [b, t, f]
        features, h, c = self.wave_encoder.lstm_layer.forward(spec, h=h, c=c)
        # features: shape, [b, t, hidden_size]
        # h: shape, [num_layers, b, hidden_size]
        # c: shape, [num_layers, b, hidden_size]

        # Pool the frame features to one vector per clip, then classify.
        feature = self.pooling_layer.forward(features)
        # feature shape: [b, f]
        logits = self.cls_head.forward(feature)
        # logits shape: [batch_size, num_classes]

        # Return the updated recurrent state so the caller can stream.
        return logits, h, c
def init_kernels(nfft: int, win_size: int, hop_size: int, win_type: str = None, inverse=False):
    """
    Build conv1d kernels that realize the (i)STFT as a convolution.

    :param nfft: FFT size.
    :param win_size: window length in samples.
    :param hop_size: hop length; not used here (the stride is applied by the caller).
    :param win_type: window name for scipy.signal.get_window; None/"None" = rectangular.
    :param inverse: if True, use the pseudo-inverse of the Fourier basis (synthesis kernels).
    :return: (kernel, window);
        kernel shape [2 * (nfft // 2 + 1), 1, win_size],
        window shape [1, win_size, 1].
    """
    if win_type == "None" or win_type is None:
        window = np.ones(win_size)
    else:
        # square-root window: applied once at analysis and once at synthesis
        window = get_window(win_type, win_size, fftbins=True)**0.5

    fourier_basis = np.fft.rfft(np.eye(nfft))[:win_size]
    real_kernel = np.real(fourier_basis)
    image_kernel = np.imag(fourier_basis)
    # stack real and imaginary parts -> [2 * freq_bins, win_size]
    kernel = np.concatenate([real_kernel, image_kernel], 1).T

    if inverse:
        kernel = np.linalg.pinv(kernel).T

    kernel = kernel * window
    kernel = kernel[:, None, :]
    result = (
        torch.from_numpy(kernel.astype(np.float32)),
        torch.from_numpy(window[None, :, None].astype(np.float32))
    )
    return result


class ConvSTFT(nn.Module):
    """
    STFT implemented as a strided 1-D convolution with fixed Fourier
    kernels — export-friendly since no torch.stft / FFT op is needed.
    """

    def __init__(self,
                 nfft: int,
                 win_size: int,
                 hop_size: int,
                 win_type: str = "hamming",
                 power: int = None,
                 requires_grad: bool = False):
        """
        :param nfft: FFT size; if None, the next power of two >= win_size.
        :param win_size: analysis window length in samples.
        :param hop_size: hop length in samples.
        :param win_type: window name for scipy.signal.get_window.
        :param power: None -> complex spectrum, 1 -> magnitude, 2 -> power.
        :param requires_grad: whether the analysis kernels are trainable.
        """
        super(ConvSTFT, self).__init__()

        if nfft is None:
            self.nfft = int(2**np.ceil(np.log2(win_size)))
        else:
            self.nfft = nfft

        kernel, _ = init_kernels(self.nfft, win_size, hop_size, win_type)
        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)

        self.win_size = win_size
        self.hop_size = hop_size

        self.stride = hop_size
        self.dim = self.nfft
        self.power = power

    def forward(self, waveform: torch.Tensor):
        """
        :param waveform: shape [b, num_samples] (a channel dim of 1 is added)
            or [b, 1, num_samples].
        :return: complex spectrum [b, f, t] when power is None; magnitude
            spectrum for power == 1; power spectrum for power == 2.
        """
        if waveform.dim() == 2:
            waveform = torch.unsqueeze(waveform, 1)

        matrix = F.conv1d(waveform, self.weight, stride=self.stride)
        dim = self.dim // 2 + 1
        # first half of the output channels are the real parts, second half imaginary
        real = matrix[:, :dim, :]
        imag = matrix[:, dim:, :]
        spec = torch.complex(real, imag)
        # spec shape: [b, f, t], torch.complex64

        if self.power is None:
            return spec
        elif self.power == 1:
            mags = torch.sqrt(real**2 + imag**2)
            # phase = torch.atan2(imag, real)
            return mags
        elif self.power == 2:
            power = real**2 + imag**2
            return power
        else:
            raise AssertionError


class ConviSTFT(nn.Module):
    """
    Inverse STFT implemented as a transposed 1-D convolution with
    pseudo-inverse Fourier kernels, with window-energy compensation
    (overlap-add normalization).
    """

    def __init__(self,
                 win_size: int,
                 hop_size: int,
                 nfft: int = None,
                 win_type: str = "hamming",
                 requires_grad: bool = False):
        super(ConviSTFT, self).__init__()
        if nfft is None:
            self.nfft = int(2**np.ceil(np.log2(win_size)))
        else:
            self.nfft = nfft

        kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
        self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
        # weight shape: [f*2, 1, win_size]
        # f = nfft // 2 + 1

        self.win_size = win_size
        self.hop_size = hop_size
        self.win_type = win_type

        self.stride = hop_size
        self.dim = self.nfft

        self.register_buffer("window", window)
        self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
        # window shape: [1, win_size, 1]
        # enframe shape: [win_size, 1, win_size]

    def forward(self,
                spec: torch.Tensor):
        """
        self.weight shape: [f*2, 1, win_size]
        self.window shape: [1, win_size, 1]
        self.enframe shape: [win_size, 1, win_size]

        :param spec: torch.Tensor, complex, shape: [b, f, t]
        :return: waveform, shape [b, 1, num_samples]
        """
        spec = torch.view_as_real(spec)
        # spec shape: [b, f, t, 2]
        matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)
        # matrix shape: [b, f*2, t]

        waveform = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
        # waveform shape: [b, 1, num_samples]

        # this is from torch-stft: https://github.com/pseeth/torch-stft
        # accumulate the squared-window energy per output sample so overlap-add
        # regions can be normalized back to unit gain
        t = self.window.repeat(1, 1, matrix.size(-1))**2
        # t shape: [1, win_size, t]
        coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
        # coff shape: [1, 1, num_samples]
        waveform = waveform / (coff + 1e-8)
        # waveform = waveform / coff
        return waveform

    @torch.no_grad()
    def forward_chunk(self,
                      spec: torch.Tensor,
                      cache_dict: dict = None
                      ):
        """
        Streaming inverse STFT for one chunk of frames; the overlapping tail
        of the synthesis (and its window-energy normalizer) is carried over
        in cache_dict so successive chunks overlap-add correctly.

        :param spec: shape: [b, f, t]
        :param cache_dict: dict,
            waveform_cache shape: [b, 1, win_size - hop_size]
            coff_cache shape: [b, 1, win_size - hop_size]
        :return: (waveform_output, new_cache_dict);
            waveform_output shape [b, 1, hop_size * t]
        """
        if cache_dict is None:
            cache_dict = defaultdict(lambda: None)
        waveform_cache = cache_dict["waveform_cache"]
        coff_cache = cache_dict["coff_cache"]

        spec = torch.view_as_real(spec)
        matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)

        waveform_current = F.conv_transpose1d(matrix, self.weight, stride=self.stride)

        t = self.window.repeat(1, 1, matrix.size(-1))**2
        coff_current = F.conv_transpose1d(t, self.enframe, stride=self.stride)

        overlap_size = self.win_size - self.hop_size

        # overlap-add the tail carried over from the previous chunk
        if waveform_cache is not None:
            waveform_current[:, :, :overlap_size] += waveform_cache
        waveform_output = waveform_current[:, :, :self.hop_size]
        new_waveform_cache = waveform_current[:, :, self.hop_size:]

        if coff_cache is not None:
            coff_current[:, :, :overlap_size] += coff_cache
        coff_output = coff_current[:, :, :self.hop_size]
        new_coff_cache = coff_current[:, :, self.hop_size:]

        waveform_output = waveform_output / (coff_output + 1e-8)

        new_cache_dict = {
            "waveform_cache": new_waveform_cache,
            "coff_cache": new_coff_cache,
        }
        return waveform_output, new_cache_dict


def main():
    """Compare whole-signal iSTFT against naive frame-by-frame overwrite."""
    nfft = 512
    win_size = 512
    hop_size = 256

    stft = ConvSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size, power=None)
    istft = ConviSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size)

    mixture = torch.rand(size=(1, 16000), dtype=torch.float32)
    b, num_samples = mixture.shape
    t = (num_samples - win_size) / hop_size + 1

    spec = stft.forward(mixture)
    b, f, t = spec.shape

    # If `spec` really came from the STFT above, the two waveform
    # reconstruction methods below agree; otherwise they differ.
    # spec = spec + 0.01 * torch.randn(size=(1, nfft//2+1, t), dtype=torch.float32)
    print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")

    waveform = istft.forward(spec)
    # shape: [batch_size, channels, num_samples]
    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
    print(waveform[:, :, 300: 302])

    waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
    for i in range(int(t)):
        begin = i * hop_size
        end = begin + win_size
        sub_spec = spec[:, :, i:i+1]
        sub_waveform = istft.forward(sub_spec)
        # (b, 1, win_size)
        waveform[:, :, begin:end] = sub_waveform
    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
    print(waveform[:, :, 300: 302])

    return


def main2():
    """Compare whole-signal iSTFT against the streaming forward_chunk path."""
    nfft = 512
    win_size = 512
    hop_size = 256

    stft = ConvSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size, power=None)
    istft = ConviSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size)

    mixture = torch.rand(size=(1, 16128), dtype=torch.float32)
    b, num_samples = mixture.shape

    spec = stft.forward(mixture)
    b, f, t = spec.shape

    # If `spec` really came from the STFT above, the two waveform
    # reconstruction methods below agree; otherwise they differ.
    spec = spec + 0.01 * torch.randn(size=(1, nfft//2+1, t), dtype=torch.float32)
    print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")

    waveform = istft.forward(spec)
    # shape: [batch_size, channels, num_samples]
    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
    print(waveform[:, :, 300: 302])

    cache_dict = None
    waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
    for i in range(int(t)):
        sub_spec = spec[:, :, i:i+1]
        begin = i * hop_size

        end = begin + win_size - hop_size
        sub_waveform, cache_dict = istft.forward_chunk(sub_spec, cache_dict=cache_dict)
        # end = begin + win_size
        # sub_waveform = istft.forward(sub_spec)

        waveform[:, :, begin:end] = sub_waveform
    print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
    print(waveform[:, :, 300: 302])

    return


if __name__ == "__main__":
    main2()
class ErbBandsNumpy(object):
    """NumPy helpers for building ERB band widths and rectangular ERB filter banks."""

    @staticmethod
    def freq2erb(freq_hz: float) -> float:
        """
        Convert frequency (Hz) to the ERB scale.

        https://www.cnblogs.com/LXP-Never/p/16011229.html
        1 / (24.7 * 9.265) = 0.00436976
        """
        return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)

    @staticmethod
    def erb2freq(n_erb: float) -> float:
        """Inverse of freq2erb: convert an ERB value back to frequency (Hz)."""
        return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)

    @classmethod
    def get_erb_widths(cls, sample_rate: int, nfft: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
        """
        Partition the nfft//2 + 1 FFT bins into erb_bins contiguous groups,
        equally spaced on the ERB scale.

        https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs
        :param sample_rate:
        :param nfft:
        :param erb_bins: number of ERB (Equivalent Rectangular Bandwidth) bands.
        :param min_freq_bins_for_erb: minimum number of frequency bins per ERB band.
        :return: array of per-band FFT-bin counts, summing to nfft // 2 + 1.
        """
        nyq_freq = sample_rate / 2.
        freq_width: float = sample_rate / nfft

        min_erb: float = cls.freq2erb(0.)
        max_erb: float = cls.freq2erb(nyq_freq)

        erb = [0] * erb_bins
        step = (max_erb - min_erb) / erb_bins

        prev_freq_bin = 0
        freq_over = 0
        for i in range(1, erb_bins + 1):
            f = cls.erb2freq(min_erb + i * step)
            freq_bin = int(round(f / freq_width))
            freq_bins = freq_bin - prev_freq_bin - freq_over

            # enforce the minimum band width; borrow bins from the next band
            if freq_bins < min_freq_bins_for_erb:
                freq_over = min_freq_bins_for_erb - freq_bins
                freq_bins = min_freq_bins_for_erb
            else:
                freq_over = 0
            erb[i - 1] = freq_bins
            prev_freq_bin = freq_bin

        # account for the Nyquist bin, then trim any excess from the last band
        erb[erb_bins - 1] += 1
        too_large = sum(erb) - (nfft / 2 + 1)
        if too_large > 0:
            erb[erb_bins - 1] -= too_large
        return np.array(erb, dtype=np.uint64)

    @staticmethod
    def get_erb_filter_bank(erb_widths: np.ndarray,
                            normalized: bool = True,
                            inverse: bool = False,
                            ):
        """
        Build a rectangular (0/1) ERB filter bank from band widths.

        :param erb_widths: FFT-bin count per ERB band (from get_erb_widths).
        :param normalized: rescale so each mapping column/row sums to one.
        :param inverse: build the ERB -> frequency mapping (transposed) instead.
        :return: ndarray, [num_freq_bins, num_erb_bins] (transposed if inverse).
        """
        num_freq_bins = int(np.sum(erb_widths))
        num_erb_bins = len(erb_widths)

        fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))

        # starting FFT-bin index of each band
        points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
        for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
            fb[b: b + w, i] = 1

        if inverse:
            fb = fb.T
            if not normalized:
                fb /= np.sum(fb, axis=1, keepdims=True)
        else:
            if normalized:
                fb /= np.sum(fb, axis=0)
        return fb

    @staticmethod
    def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
        """
        ERB filterbank and transform to decibel scale.

        :param spec: Spectrum of shape [B, C, T, F].
        :param erb_fb: ERB filterbank array of shape [B] containing the ERB widths,
            where B are the number of ERB bins.
        :param db: Whether to transform the output into decibel scale. Defaults to `True`.
        :return: ERB features as float32.
        """
        # complex spec to power spec. (real * real + image * image)
        spec_ = np.abs(spec) ** 2

        # spec to erb feature.
        erb_feat = np.matmul(spec_, erb_fb)

        if db:
            erb_feat = 10 * np.log10(erb_feat + 1e-10)

        erb_feat = np.array(erb_feat, dtype=np.float32)
        return erb_feat


class ErbBands(nn.Module):
    """
    Torch wrapper around the NumPy-built ERB filter banks; projects spectra
    to the ERB scale (optionally in dB) and back.
    """
    def __init__(self,
                 sample_rate: int = 8000,
                 nfft: int = 512,
                 erb_bins: int = 32,
                 min_freq_bins_for_erb: int = 2,
                 ):
        super().__init__()
        self.sample_rate = sample_rate
        self.nfft = nfft
        self.erb_bins = erb_bins
        self.min_freq_bins_for_erb = min_freq_bins_for_erb

        # fixed (non-trainable) forward and inverse filter banks
        erb_fb, erb_fb_inv = self.init_erb_fb()
        erb_fb = torch.tensor(erb_fb, dtype=torch.float32, requires_grad=False)
        erb_fb_inv = torch.tensor(erb_fb_inv, dtype=torch.float32, requires_grad=False)
        self.erb_fb = nn.Parameter(erb_fb, requires_grad=False)
        self.erb_fb_inv = nn.Parameter(erb_fb_inv, requires_grad=False)

    def init_erb_fb(self):
        """Compute the forward and inverse ERB filter banks as numpy arrays."""
        erb_widths = ErbBandsNumpy.get_erb_widths(
            sample_rate=self.sample_rate,
            nfft=self.nfft,
            erb_bins=self.erb_bins,
            min_freq_bins_for_erb=self.min_freq_bins_for_erb,
        )
        erb_fb = ErbBandsNumpy.get_erb_filter_bank(
            erb_widths=erb_widths,
            normalized=True,
            inverse=False,
        )
        erb_fb_inv = ErbBandsNumpy.get_erb_filter_bank(
            erb_widths=erb_widths,
            normalized=True,
            inverse=True,
        )
        return erb_fb, erb_fb_inv

    def erb_scale(self, spec: torch.Tensor, db: bool = True):
        """Project spec (b, t, f) onto ERB bands; optionally convert to dB."""
        # spec shape: (b, t, f)
        spec_erb = torch.matmul(spec, self.erb_fb)
        if db:
            spec_erb = 10 * torch.log10(spec_erb + 1e-10)
        return spec_erb

    def erb_scale_inv(self, spec_erb: torch.Tensor):
        """Map ERB-band features back to the linear frequency axis."""
        spec = torch.matmul(spec_erb, self.erb_fb_inv)
        return spec


def main():
    """Smoke test: round-trip a random spectrogram through the ERB mapping."""
    erb_bands = ErbBands()

    spec = torch.randn(size=(2, 199, 257), dtype=torch.float32)
    spec_erb = erb_bands.erb_scale(spec)
    print(spec_erb.shape)

    spec = erb_bands.erb_scale_inv(spec_erb)
    print(spec.shape)

    return


if __name__ == "__main__":
    main()
class MelBandsNumpy(object):
    """NumPy helpers for constructing a triangular mel filter bank."""

    @staticmethod
    def freq2mel(freq_hz):
        """Convert frequency in Hz to the mel scale."""
        return 2595.0 * np.log10(1.0 + freq_hz / 700.0)

    @staticmethod
    def mel2freq(mel):
        """Convert a mel value back to frequency in Hz."""
        return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)

    @classmethod
    def get_mel_points(cls,
                       n_mels: int,
                       f_min: float,
                       f_max: float,
                       ):
        """Return n_mels + 2 triangle edge frequencies (Hz), equally spaced in mel."""
        lo_mel = cls.freq2mel(f_min)
        hi_mel = cls.freq2mel(f_max)
        return cls.mel2freq(np.linspace(lo_mel, hi_mel, n_mels + 2))

    @classmethod
    def get_mel_filter_bank(cls,
                            n_fft: int,
                            n_mels: int,
                            sample_rate: int,
                            f_min: float = 0.0,
                            f_max: float = None,
                            ):
        """
        Build a [n_fft // 2 + 1, n_mels] triangular mel filter bank.

        Column m is a triangle rising from edge m to edge m + 1 and falling
        to edge m + 2, evaluated at the FFT bin centers, then normalized so
        each column sums to one.

        :param n_fft: FFT size; output has n_fft // 2 + 1 frequency rows.
        :param n_mels: number of mel bands (columns).
        :param sample_rate: sampling rate in Hz.
        :param f_min: lowest edge frequency in Hz.
        :param f_max: highest edge frequency in Hz; defaults to Nyquist.
        :return: torch.FloatTensor filter bank.
        """
        f_max = float(sample_rate) / 2.0 if f_max is None else f_max
        n_freqs = n_fft // 2 + 1

        edges = cls.get_mel_points(n_mels=n_mels, f_min=f_min, f_max=f_max)
        # center frequency (Hz) of every FFT bin
        bin_freqs = np.linspace(0, sample_rate / 2, n_freqs)

        fb = np.zeros((n_freqs, n_mels), dtype=np.float32)
        for m, (f_lo, f_mid, f_hi) in enumerate(zip(edges[:-2], edges[1:-1], edges[2:])):
            rising = (bin_freqs - f_lo) / (f_mid - f_lo)
            falling = (f_hi - bin_freqs) / (f_hi - f_mid)
            fb[:, m] = np.maximum(0.0, np.minimum(rising, falling))

        # normalize every triangle to unit sum (guard against all-zero columns)
        fb /= np.maximum(fb.sum(axis=0, keepdims=True), 1e-10)
        return torch.from_numpy(fb)


class MelBands(nn.Module):
    """Applies a fixed mel filter bank to a (batch, time, freq_bins) spectrogram."""

    def __init__(
            self,
            n_fft: int = 512,
            n_mels: int = 64,
            sample_rate: int = 16000,
            f_min: float = 0.0,
            f_max: float = None,
    ):
        super().__init__()
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.f_min = f_min
        self.f_max = f_max

        # mel_fb shape: [freq_bins, mel_bins]; registered as a buffer so it
        # follows the module across devices but is never trained
        self.register_buffer(
            "mel_fb",
            MelBandsNumpy.get_mel_filter_bank(
                n_fft=self.n_fft,
                n_mels=self.n_mels,
                sample_rate=self.sample_rate,
                f_min=self.f_min,
                f_max=self.f_max,
            ),
        )

    def mel_scale(self, spec: torch.Tensor) -> torch.Tensor:
        """Project spec of shape (b, t, freq_bins) to (b, t, mel_bins)."""
        return torch.matmul(spec, self.mel_fb)


def main():
    """Smoke test: project a random spectrogram onto 80 mel bands."""
    spec = torch.randn(2, 199, 257)  # (batch, time, freq_bins)
    mel_layer = MelBands(n_fft=512, n_mels=80, sample_rate=16000)
    mel_feat = mel_layer.mel_scale(spec)  # (2, 199, 80)
    print(mel_feat.shape)
    return


if __name__ == "__main__":
    main()
class ErbEMA(nn.Module, EMANumpy):
    """
    Exponential-moving-average mean normalization for ERB (dB) features,
    applied frame by frame along the time axis so the same code works for
    streaming inference (the running state is passed in and returned).
    """
    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 erb_bins: int = 32,
                 mean_norm_init_start: float = -60.,
                 mean_norm_init_end: float = -90.,
                 norm_tau: float = 1.,
                 ):
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.erb_bins = erb_bins
        # initial per-band mean estimate in dB, linearly spaced start -> end
        self.mean_norm_init_start = mean_norm_init_start
        self.mean_norm_init_end = mean_norm_init_end
        self.norm_tau = norm_tau

        # per-frame decay factor derived from the decay window norm_tau (seconds);
        # get_norm_alpha comes from the EMANumpy mixin
        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_erb_norm_state(self) -> torch.Tensor:
        """Fresh initial EMA state (running mean per ERB band)."""
        state = torch.linspace(start=self.mean_norm_init_start, end=self.mean_norm_init_end,
                               steps=self.erb_bins)
        state = state.unsqueeze(0).unsqueeze(0)
        # state shape: [b, c, erb_bins]
        # state shape: [1, 1, erb_bins]
        return state

    def norm(self,
             feat_erb: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """
        Mean-normalize feat_erb frame by frame with an EMA of the band means.

        :param feat_erb: ERB features, shape [b, c, t, f].
        :param state: optional running mean carried over from a previous call.
        :return: (normalized features, final state) — pass the state back in
            on the next chunk for streaming.
        """
        feat_erb = feat_erb.clone()  # never mutate the caller's tensor
        b, c, t, f = feat_erb.shape

        # erb_feat shape: [b, c, t, f]
        if state is None:
            state = self.make_erb_norm_state()
            state = state.to(feat_erb.device)
        state = state.clone()

        for j in range(t):
            current = feat_erb[:, :, j, :]
            # EMA update of the running mean
            new_state = current * (1 - self.alpha) + state * self.alpha

            # subtract the running mean; /40 scales roughly into [-1, 1]
            # for dB-valued features — TODO confirm the intended range
            feat_erb[:, :, j, :] = (current - new_state) / 40.0
            state = new_state

        return feat_erb, state


class SpecEMA(nn.Module, EMANumpy):
    """
    Exponential-moving-average unit-norm normalization of complex spectrum
    features (real/imag stacked in the channel dim), frame by frame.

    https://github.com/grazder/DeepFilterNet/blob/torchDF_main/libDF/src/lib.rs
    """
    def __init__(self,
                 sample_rate: int = 8000,
                 hop_size: int = 80,
                 df_bins: int = 96,
                 unit_norm_init_start: float = 0.001,
                 unit_norm_init_end: float = 0.0001,
                 norm_tau: float = 1.,
                 ):
        super().__init__()
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.df_bins = df_bins
        # initial per-bin energy estimate, linearly spaced start -> end
        self.unit_norm_init_start = unit_norm_init_start
        self.unit_norm_init_end = unit_norm_init_end
        self.norm_tau = norm_tau

        # per-frame decay factor (see EMANumpy.get_norm_alpha)
        self.alpha = self.get_norm_alpha(sample_rate, hop_size, norm_tau)

    def make_spec_norm_state(self) -> torch.Tensor:
        """Fresh initial EMA state (running energy per df bin)."""
        state = torch.linspace(start=self.unit_norm_init_start, end=self.unit_norm_init_end,
                               steps=self.df_bins)
        state = state.unsqueeze(0).unsqueeze(0)
        # state shape: [b, c, df_bins]
        # state shape: [1, 1, df_bins]
        return state

    def norm(self,
             feat_spec: torch.Tensor,
             state: torch.Tensor = None,
             ):
        """
        Scale each frame by 1/sqrt(EMA of its per-bin energy).

        :param feat_spec: shape [b, 2, t, df_bins] (real/imag channels).
        :param state: optional running energy carried over from a previous call.
        :return: (normalized features, final state).
        """
        feat_spec = feat_spec.clone()  # never mutate the caller's tensor
        b, c, t, f = feat_spec.shape

        # feat_spec shape: [b, 2, t, df_bins]
        if state is None:
            state = self.make_spec_norm_state()
            state = state.to(feat_spec.device)
        state = state.clone()

        for j in range(t):
            current = feat_spec[:, :, j, :]
            # squared magnitude summed over the real/imag channel dim
            current_abs = torch.sum(torch.square(current), dim=1, keepdim=True)
            # current_abs shape: [b, 1, df_bins]
            new_state = current_abs * (1 - self.alpha) + state * self.alpha

            feat_spec[:, :, j, :] = current / torch.sqrt(new_state)
            state = new_state

        return feat_spec, state


# [start dB, end dB] of the initial ERB mean state (matches ErbEMA defaults)
MEAN_NORM_INIT = [-60., -90.]


def make_erb_norm_state(erb_bins: int, channels: int) -> np.ndarray:
    """Initial EMA mean state for the NumPy reference implementation."""
    state = np.linspace(MEAN_NORM_INIT[0], MEAN_NORM_INIT[1], erb_bins)
    state = np.expand_dims(state, axis=0)
    state = np.repeat(state, channels, axis=0)

    # state shape: (audio_channels, erb_bins)
    return state


def erb_normalize(erb_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Pure-NumPy, element-by-element reference of ErbEMA.norm.

    NOTE: when `state` is provided it is updated in place.

    :param erb_feat: shape (batch_size, time_steps, erb_bins).
    :param alpha: EMA decay factor per frame.
    :param state: optional running mean, shape (batch_size, erb_bins).
    :return: normalized copy of erb_feat.
    """
    erb_feat = np.copy(erb_feat)
    batch_size, time_steps, erb_bins = erb_feat.shape

    if state is None:
        state = make_erb_norm_state(erb_bins, erb_feat.shape[0])

    for i in range(batch_size):
        for j in range(time_steps):
            for k in range(erb_bins):
                x = erb_feat[i][j][k]
                s = state[i][k]

                state[i][k] = x * (1. - alpha) + s * alpha
                erb_feat[i][j][k] -= state[i][k]
                erb_feat[i][j][k] /= 40.

    return erb_feat


# [start, end] of the initial unit-norm state (matches SpecEMA defaults)
UNIT_NORM_INIT = [0.001, 0.0001]


def make_spec_norm_state(df_bins: int, channels: int) -> np.ndarray:
    """Initial EMA energy state for the NumPy reference implementation."""
    state = np.linspace(UNIT_NORM_INIT[0], UNIT_NORM_INIT[1], df_bins)
    state = np.expand_dims(state, axis=0)
    state = np.repeat(state, channels, axis=0)

    # state shape: (audio_channels, df_bins)
    return state


def spec_normalize(spec_feat: np.ndarray, alpha: float, state: np.ndarray = None):
    """
    Pure-NumPy reference of spectrum unit-norm normalization.

    NOTE: when `state` is provided it is updated in place.
    NOTE(review): this tracks the EMA of |x| whereas SpecEMA.norm tracks the
    EMA of the squared magnitude summed over channels — the two are not
    numerically equivalent; confirm which one the trained model expects.

    :param spec_feat: shape (batch_size, time_steps, df_bins).
    :param alpha: EMA decay factor per frame.
    :param state: optional running state, shape (batch_size, df_bins).
    :return: normalized copy of spec_feat.
    """
    spec_feat = np.copy(spec_feat)
    batch_size, time_steps, df_bins = spec_feat.shape

    if state is None:
        state = make_spec_norm_state(df_bins, spec_feat.shape[0])

    for i in range(batch_size):
        for j in range(time_steps):
            for k in range(df_bins):
                x = spec_feat[i][j][k]
                s = state[i][k]

                state[i][k] = np.abs(x) * (1. - alpha) + s * alpha
                spec_feat[i][j][k] /= np.sqrt(state[i][k])
    return spec_feat


if __name__ == "__main__":
    pass