loadingy commited on
Commit
51be264
·
0 Parent(s):

first push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gitignore +186 -0
  3. Dockerfile +55 -0
  4. LICENSE +201 -0
  5. README.md +34 -0
  6. c2cite.py +300 -0
  7. c2cite/__init__.py +52 -0
  8. c2cite/adapters/__init__.py +104 -0
  9. c2cite/adapters/loramoe/__init__.py +7 -0
  10. c2cite/adapters/loramoe/config.py +42 -0
  11. c2cite/adapters/loramoe/model.py +62 -0
  12. c2cite/adapters/mixlora/__init__.py +19 -0
  13. c2cite/adapters/mixlora/config.py +144 -0
  14. c2cite/adapters/mixlora/model.py +610 -0
  15. c2cite/adapters/mola/__init__.py +8 -0
  16. c2cite/adapters/mola/config.py +57 -0
  17. c2cite/adapters/mola/model.py +159 -0
  18. c2cite/common/__init__.py +92 -0
  19. c2cite/common/abstracts.py +194 -0
  20. c2cite/common/attention.py +293 -0
  21. c2cite/common/cache.py +554 -0
  22. c2cite/common/checkpoint.py +33 -0
  23. c2cite/common/config.py +234 -0
  24. c2cite/common/feed_forward.py +70 -0
  25. c2cite/common/lora_linear.py +511 -0
  26. c2cite/common/moe_utils.py +57 -0
  27. c2cite/common/rope.py +88 -0
  28. c2cite/dispatcher.py +378 -0
  29. c2cite/evaluator.py +518 -0
  30. c2cite/executors/__init__.py +54 -0
  31. c2cite/executors/common.py +77 -0
  32. c2cite/executors/cpu.py +51 -0
  33. c2cite/executors/cuda.py +53 -0
  34. c2cite/executors/mps.py +71 -0
  35. c2cite/generator.py +669 -0
  36. c2cite/model.py +1039 -0
  37. c2cite/models/__init__.py +40 -0
  38. c2cite/models/modeling_chatglm.py +855 -0
  39. c2cite/models/modeling_gemma.py +131 -0
  40. c2cite/models/modeling_gemma2.py +528 -0
  41. c2cite/models/modeling_llama.py +579 -0
  42. c2cite/models/modeling_mistral.py +255 -0
  43. c2cite/models/modeling_phi.py +576 -0
  44. c2cite/models/modeling_phi3.py +581 -0
  45. c2cite/prompter.py +63 -0
  46. c2cite/solutions.py +9 -0
  47. c2cite/tasks/__init__.py +29 -0
  48. c2cite/tasks/attribute_tasks.py +567 -0
  49. c2cite/tasks/common.py +1045 -0
  50. c2cite/tasks/glue_tasks.py +90 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ paper_wsdm_c2cite.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ # IDEs
163
+ .vscode/
164
+
165
+ # MoE-PEFT
166
+ __pycache__/
167
+ *.egg-info/
168
+ *.egg
169
+ moe_peft.json
170
+ moe_peft_train_*.json
171
+
172
+ # macOS junk files
173
+ .DS_Store
174
+
175
+ # PEFT adapters
176
+ adapter_model.bin
177
+ adapter_config.json
178
+
179
+ result/
180
+ checkpoints/
181
+ cases/
182
+ dataset/
183
+ tblogs/
184
+ *.png
185
+ *.svg
186
+ logs
Dockerfile ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.5.1-devel-ubuntu22.04
2
+
3
+ ARG PYTHON_VERSION=3.11
4
+ ARG http_proxy
5
+ ARG https_proxy
6
+
7
+ RUN apt-get update
8
+
9
+ RUN apt-get install -y \
10
+ locales \
11
+ build-essential \
12
+ git \
13
+ git-lfs \
14
+ vim \
15
+ cmake \
16
+ pkg-config \
17
+ zlib1g-dev libncurses5-dev \
18
+ libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev wget \
19
+ liblzma-dev libsqlite3-dev libbz2-dev
20
+
21
+ RUN apt-get clean
22
+
23
+ ENV LANG=en_US.UTF-8
24
+ ENV LANGUAGE=en_US:en
25
+ ENV LC_ALL=en_US.UTF-8
26
+
27
+ RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen
28
+
29
+ ENV PYENV_ROOT=/root/.pyenv
30
+ ENV PATH="$PYENV_ROOT/bin/:$PATH"
31
+
32
+ RUN /usr/bin/echo -e '#!/bin/bash\neval "$(pyenv init -)"\neval "$(pyenv virtualenv-init -)"\ncd /moe_peft\nbash' | tee /opt/init.sh \
33
+ && chmod +x /opt/init.sh \
34
+ && /usr/bin/echo -e 'export PYENV_ROOT=/root/.pyenv' >> ~/.bashrc \
35
+ && /usr/bin/echo -e 'export PATH=/root/.pyenv/bin:$PATH' >> ~/.bashrc \
36
+ && /usr/bin/echo -e 'eval "$(pyenv init -)"' >> ~/.bashrc \
37
+ && /usr/bin/echo -e 'eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc \
38
+ && git clone https://github.com/pyenv/pyenv.git /root/.pyenv \
39
+ && git clone https://github.com/pyenv/pyenv-virtualenv.git /root/.pyenv/plugins/pyenv-virtualenv \
40
+ && cd /root/.pyenv && src/configure && make -C src \
41
+ && eval "$(pyenv init -)" \
42
+ && eval "$(pyenv virtualenv-init -)"
43
+
44
+ RUN . ~/.bashrc \
45
+ && pyenv install $PYTHON_VERSION \
46
+ && pyenv global $PYTHON_VERSION \
47
+ && git clone https://github.com/TUDB-Labs/MoE-PEFT /moe_peft \
48
+ && cd /moe_peft \
49
+ && pyenv virtualenv $PYTHON_VERSION moe_peft \
50
+ && pyenv local moe_peft \
51
+ && pip install -r ./requirements.txt --upgrade --no-compile --no-cache-dir
52
+
53
+ WORKDIR /moe_peft
54
+
55
+ CMD ["/bin/bash", "/opt/init.sh"]
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This repository contains the code for the paper “C$^2$-Cite: Contextual-Aware Citation Generation for Attributed Large Language Models”. The project is based on the open-source repository "[TUDB-Labs/MoE-PEFT](https://github.com/TUDB-Labs/MoE-PEFT)". C$^2$-Cite is a model that can answer questions with citation markers.
2
+ ## File description
3
+ - **config**: Including the configurations of training or evaluating
4
+ - **c2cite/backends**: Some backend tools for GMoE.
5
+ - **c2cite/common**: The implementation of Transformer architecture.
6
+ - **c2cite/models**: The implementation of some series of Transformer-based models.
7
+ - **c2cite/tasks**: The implementation of datasets.
8
+ - **c2cite.py**: The entry point of this project.
9
+ ## Environment Requirements
10
+ - python3=3.11
11
+ - pytorch >= 2.1.2
12
+ - Other dependencies, See ```requirements.txt```
13
+ ## Quick Start
14
+ ### STEP 1: Download Base models
15
+ - [Llama-3-8B-inst]
16
+ ### STEP 2: Download training datasets
17
+ To get the training dataset proposed in the paper "Towards Faithful and Robust LLM Specialists for Evidence-Based Question-Answering", download [SynSciQA](https://github.com/EdisonNi-hku/Robust_Evidence_Based_QA), then put SynSciQA.json, SynSciQA+.json, and SynSciQA++.json in ./dataset/SynSciQA
18
+ ### STEP 3: Download evaluation datasets
19
+ We evaluate our model and baselines using [ALCE](https://github.com/princeton-nlp/ALCE). To get the evaluation datasets, please run
20
+ ```bash
21
+ bash download_test_data.sh
22
+ ```
23
+ ### STEP 4: Start training
24
+ Replace the **[base model]** and the **[train/evaluate config]** below with the directory of base model and the configuration in Folder "config".
25
+ ``````python
26
+ python c2cite.py --dir ./checkpoint --log_file ./logs --verbose --seed 42 --attn_impl eager --base_model [base model] --config [train/evaluate config] --device cuda:0
27
+ ``````
28
+ ### STEP 5: Conduct evaluation
29
+ After training process, we can conduct the evaluation step with the command below:
30
+ ``````python
31
+ python c2cite.py --dir ./checkpoint --log_file ./logs --verbose --seed 42 --attn_impl eager --base_model [base model] --config [train/evaluate config] --device cuda:0 --evaluate
32
+ ``````
33
+ ***Note***: **Do not** change the information in the **train config** after training step, or it won't find the right adapter.
34
+
c2cite.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import sys
6
+ from typing import Dict, List, Tuple, Union
7
+
8
+ import torch
9
+ from transformers.utils import is_flash_attn_2_available
10
+
11
+ import moe_peft
12
+ import moe_peft.adapters
13
+
14
# Command Line Arguments
# Builds the CLI for the MoE-PEFT entry point. `args` is a module-level
# global consumed by the helper functions and the __main__ block below.
# Fix: help text for --verbose said "informations" (not a word) — now
# "information".
parser = argparse.ArgumentParser(description="MoE-PEFT main program")
parser.add_argument(
    "--base_model", type=str, required=True, help="Path to or name of base model"
)
parser.add_argument(
    "--inference", action="store_true", help="The inference mode (just for test)"
)
parser.add_argument(
    "--evaluate", action="store_true", help="The evaluate mode (just for test)"
)
parser.add_argument(
    "--disable_prompter", action="store_true", help="Disable prompter when inference"
)
parser.add_argument(
    "--load_adapter",
    action="store_true",
    help="Load adapter from file instead of init randomly",
)
# NOTE(review): --disable_adapter is declared but never read in this file;
# presumably consumed elsewhere — confirm before removing.
parser.add_argument(
    "--disable_adapter", action="store_true", help="Disable the adapter modules"
)
parser.add_argument(
    "--attn_impl", type=str, help="Specify the implementation of attention"
)
parser.add_argument(
    "--sliding_window",
    action="store_true",
    help="Use sliding window attention (requires flash attention)",
)
parser.add_argument(
    "--disable_cache",
    action="store_true",
    help="Disable cache when inference",
)
parser.add_argument(
    "--cache_implementation",
    type=str,
    help="Specify the implementation of cache",
)
parser.add_argument(
    "--fp16", action="store_true", help="Load base model in float16 precision"
)
parser.add_argument(
    "--bf16", action="store_true", help="Load base model in bfloat16 precision"
)
parser.add_argument(
    "--tf32", action="store_true", help="Use tfloat32 instead of float32 if available"
)
parser.add_argument(
    "--load_8bit", action="store_true", help="Load base model with 8bit quantization"
)
parser.add_argument(
    "--load_4bit", action="store_true", help="Load base model with 4bit quantization"
)
parser.add_argument("--device", type=str, help="Specify which GPU to be used")
parser.add_argument(
    "--config", type=str, required=True, help="Path to finetune configuration"
)
parser.add_argument(
    "--seed", type=int, default=42, help="Random seed in integer, default is 42"
)
parser.add_argument(
    "--dir", type=str, default=".", help="Path to read or save checkpoints"
)
parser.add_argument("--disable_log", action="store_true", help="Disable logging")
parser.add_argument("--log_file", type=str, help="Save log to specific file")
parser.add_argument(
    "--verbose", action="store_true", help="Show extra information such as parameters"
)
parser.add_argument(
    "--overwrite",
    action="store_true",
    help="Overwrite adapter model when older one existed",
)
parser.add_argument("--debug", action="store_true", help="Enabling debugging mode")
parser.add_argument(
    "--deterministic",
    action="store_true",
    help="Use deterministic algorithms to improve the reproducibility",
)

args = parser.parse_args()
97
+
98
+
99
def query_yes_no(question, default="no"):
    """Ask *question* on stdout and read a yes/no reply from stdin.

    Returns True for an affirmative answer and False for a negative one.
    An empty reply selects *default*, which must be "yes", "no", or None
    (None forces the user to answer explicitly); anything else raises
    ValueError. Keeps prompting until a recognizable reply is given.
    """
    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    prompts = {None: " [y/n] ", "yes": " [Y/n] ", "no": " [y/N] "}

    if default not in prompts:
        raise ValueError("invalid default answer: '%s'" % default)
    suffix = prompts[default]

    while True:
        sys.stdout.write(question + suffix)
        reply = input().lower()
        if not reply and default is not None:
            return answers[default]
        if reply in answers:
            return answers[reply]
        sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
119
+
120
+
121
def load_base_model() -> Tuple[moe_peft.Tokenizer, moe_peft.LLMModel]:
    """Instantiate the tokenizer and base LLM selected on the command line.

    Quantization bits come from --load_8bit / --load_4bit (8-bit wins when
    both are given, matching the original precedence) and the load dtype
    from --bf16 / --fp16; the defaults are no quantization and float32.
    Reads the module-level ``args`` namespace.
    """
    logging.info("Initializing pre-trained model.")

    if args.load_8bit:
        quant_bits = 8
    elif args.load_4bit:
        quant_bits = 4
    else:
        quant_bits = None

    if args.bf16:
        load_dtype = torch.bfloat16
    elif args.fp16:
        load_dtype = torch.float16
    else:
        load_dtype = torch.float32

    model = moe_peft.LLMModel.from_pretrained(
        name_or_path=args.base_model,
        device=args.device,
        attn_impl=args.attn_impl,
        use_sliding_window=args.sliding_window,
        bits=quant_bits,
        load_dtype=load_dtype,
    )
    return moe_peft.Tokenizer(args.base_model), model
139
+
140
+
141
def init_adapter_config(
    config: Dict[str, any],
    llm_model: moe_peft.LLMModel,
) -> List[Union[moe_peft.GenerateConfig, moe_peft.TrainConfig]]:
    """Attach every adapter listed under config["lora"] to *llm_model* and
    build one run-config per adapter for the selected mode.

    Mode is taken from the module-level ``args``: inference produces
    GenerateConfig, --evaluate produces EvaluateConfig entries (possibly
    several per adapter, hence extend), otherwise TrainConfig for training.
    May mutate *config* (cutoff_len) and call exit(0) if the user declines
    to overwrite an existing adapter file.
    """
    config_list = []

    # A cutoff_len of -1 in the JSON config means "use the model's maximum
    # sequence length"; resolved in place so later code sees a real value.
    if config["cutoff_len"] == -1:
        config["cutoff_len"] = llm_model.config_.max_seq_len_
        logging.info(f"Setting cutoff_len to {llm_model.config_.max_seq_len_} automatically.")

    for lora_config in config["lora"]:
        adapter_name = lora_config["name"]
        adapter_path = f"{args.dir}{os.sep}{adapter_name}"
        # Training would clobber an existing adapter checkpoint: either the
        # user passed --overwrite, or we ask interactively and abort on "no".
        if not args.load_adapter and os.path.exists(adapter_path):
            if args.overwrite:
                logging.warning(
                    f"Overwriting existed adapter model file: {adapter_path}"
                )
            elif not query_yes_no(
                f"Existed adapter model file detected: {adapter_path}\n" + "Overwrite?"
            ):
                logging.info("User canceled training due to file conflict.")
                exit(0)

        # Either restore trained adapter weights from disk or initialize a
        # fresh adapter from the JSON description.
        if args.load_adapter:
            llm_model.load_adapter(adapter_path, adapter_name)
        else:
            llm_model.init_adapter(moe_peft.adapters.lora_config_factory(lora_config))

        if args.inference:
            config_class = moe_peft.GenerateConfig(adapter_name=adapter_name)
            # --disable_prompter suppresses the per-adapter prompt template.
            if not args.disable_prompter:
                config_class.prompt_template = lora_config.get("prompt", None)
            config_list.append(config_class)
        elif args.evaluate:
            config_list.extend(moe_peft.EvaluateConfig.from_config(lora_config))
        else:
            config_list.append(moe_peft.TrainConfig.from_config(lora_config))

        # NOTE(review): with --evaluate this logs only the last of possibly
        # several configs added above.
        if args.verbose:
            logging.info(config_list[-1].__dict__)

    return config_list
184
+
185
+
186
def inference_callback(cur_pos, outputs):
    """Streaming callback: print the current decode position, then each
    adapter's first partial output so far ({adapter_name: [text, ...]})."""
    print(f"POSITION: {cur_pos}")
    for adapter_name, texts in outputs.items():
        print(f"{adapter_name} OUTPUT: {texts[0]}")
190
+
191
+
192
def inference(
    model: moe_peft.LLMModel,
    tokenizer: moe_peft.Tokenizer,
    configs: List[moe_peft.GenerateConfig],
    concurrent_jobs: int,
):
    """Interactive generation loop.

    Reads raw prompts from stdin (no prompt template applied here beyond
    what each config carries), runs generation for every adapter config,
    and prints each adapter's output. Entering "QUIT" exits the loop.
    Streaming output is echoed via inference_callback unless --disable_log.
    """
    separator = f"\n{'='*10}\n"
    stream_cb = None if args.disable_log else inference_callback

    while (raw_prompt := input("INPUT WITHOUT PROMPT: ")) != "QUIT":
        for cfg in configs:
            cfg.prompts = [raw_prompt]
        results = moe_peft.generate(
            model,
            tokenizer,
            configs,
            max_gen_len=128,
            use_cache=not args.disable_cache,
            concurrent_jobs=concurrent_jobs,
            cache_implementation=args.cache_implementation,
            stream_callback=stream_cb,
        )
        print(separator)
        print(f"PROMPT: {raw_prompt}")
        for adapter_name, texts in results.items():
            print(f"{adapter_name} OUTPUT:")
            print(texts[0])
        print(separator)
221
+
222
+
223
# Main Function
if __name__ == "__main__":
    # Anomaly detection makes autograd report the op that produced NaN/Inf.
    if args.debug:
        torch.autograd.set_detect_anomaly(True)

    # Inference and evaluation both run against previously trained adapters,
    # so force loading them from disk instead of random initialization.
    if args.inference or args.evaluate:
        args.load_adapter = True
        inference_mode = True
    else:
        inference_mode = False
    moe_peft.setup_logging("INFO", args.log_file)

    moe_peft_executor = moe_peft.executor

    # Abort early when no usable compute backend is available.
    if not moe_peft_executor.check_available():
        exit(-1)

    # Default attention implementation: flash attention for CUDA inference
    # when the package is installed, otherwise the eager reference kernel.
    if args.attn_impl is None:
        if (
            inference_mode
            and moe_peft_executor.device_name() == "cuda"
            and is_flash_attn_2_available()
        ):
            args.attn_impl = "flash_attn"
        else:
            args.attn_impl = "eager"

    if args.device is None:
        args.device = moe_peft.executor.default_device_name()

    # Reproducibility knobs are set before any model weights are touched.
    moe_peft_executor.use_deterministic_algorithms(args.deterministic)
    moe_peft_executor.allow_tf32(args.tf32)
    moe_peft_executor.manual_seed(args.seed)

    with open(args.config, "r", encoding="utf8") as fp:
        config = json.load(fp)

    tokenizer, model = load_base_model()
    adapters = init_adapter_config(config, model)

    # Drop temporary buffers created while attaching adapters.
    moe_peft_executor.empty_cache()

    if os.getenv("MOE_PEFT_EVALUATE_MODE") is None:
        logging.info("Using efficient operators.")
    else:
        logging.info("Using deterministic operators.")

    # Dispatch: interactive inference, batch evaluation, or training (default).
    if args.inference:
        inference(
            model=model,
            tokenizer=tokenizer,
            configs=adapters,
            concurrent_jobs=config.get("inference_lora_simultaneously_num", 2),
        )
    elif args.evaluate:
        moe_peft.evaluate(
            model=model,
            tokenizer=tokenizer,
            configs=adapters,
            max_concurrent_jobs=config.get("eval_lora_simultaneously_num", None),
            retrying_steps=config.get("eval_rollback_retrying_steps", 20),
            max_seq_len=config["cutoff_len"],
            save_file=config.get("evaluate_result", None),
            # NOTE(review): -1 presumably means "disabled" for these two
            # options — confirm against moe_peft.evaluate's signature.
            require_attention = -1,
            require_hide = -1,
        )
    else:
        moe_peft.train(
            model=model,
            tokenizer=tokenizer,
            configs=adapters,
            max_concurrent_jobs=config.get("train_lora_simultaneously_num", None),
            strategy=config["train_strategy"],
            cutoff_len=config["cutoff_len"],
            save_step=config["save_step"],
            save_dir=args.dir,
        )
c2cite/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Package entry point: re-export the public MoE-PEFT API and validate the
runtime environment at import time."""

from .common import (
    AdapterConfig,
    LLMBatchConfig,
    LLMCache,
    LLMForCausalLM,
    LLMModelConfig,
    LLMModelInput,
    LLMModelOutput,
    LoraConfig,
    cache_factory,
)
from .dispatcher import Dispatcher, TrainTask
from .evaluator import EvaluateConfig, evaluate
from .executors import executor
from .generator import GenerateConfig, generate
from .model import LLMModel
from .prompter import Prompter
from .tokenizer import Tokenizer
from .trainer import TrainConfig, train
from .utils import is_package_available, setup_logging

# Explicit raises instead of `assert`: asserts are stripped when Python runs
# with -O, which would silently skip these mandatory version checks.
if not is_package_available("torch", "2.3.0"):
    raise ImportError("MoE-PEFT requires torch>=2.3.0")
if not is_package_available("transformers", "4.43.0"):
    raise ImportError("MoE-PEFT requires transformers>=4.43.0")

setup_logging()

__all__ = [
    "LLMCache",
    "cache_factory",
    "LLMModelConfig",
    "LLMModelOutput",
    "LLMForCausalLM",
    "LLMBatchConfig",
    "LLMModelInput",
    "AdapterConfig",
    "LoraConfig",
    "TrainTask",
    "Dispatcher",
    "EvaluateConfig",
    "evaluate",
    "GenerateConfig",
    "generate",
    "TrainConfig",
    "train",
    "LLMModel",
    "Prompter",
    "Tokenizer",
    "setup_logging",
    "executor",
]
c2cite/adapters/__init__.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, TypeAlias
2
+
3
+ import torch
4
+
5
+ from moe_peft.common import AdapterConfig, LoraConfig
6
+
7
+ from .loramoe import LoraMoe, LoraMoeConfig
8
+ from .mixlora import (
9
+ DynamicRouterLoss,
10
+ DynamicSparseMoe,
11
+ MixLoraConfig,
12
+ MixtralRouterLoss,
13
+ MixtralSparseMoe,
14
+ SwitchRouterLoss,
15
+ SwitchSparseMoe,
16
+ )
17
+ from .mola import MolaConfig, MolaRouterLoss, MolaSparseMoe
18
+
19
# Explicit "peft_type" field -> adapter config class.
peft_type_dict = {
    "LORA": LoraConfig,
    "MIXLORA": MixLoraConfig,
    "LORAMOE": LoraMoeConfig,
    "MOLA": MolaConfig,
}

# Fallback lookup used when a raw config only carries "routing_strategy".
routing_strategy_dict = {
    "mixlora": MixLoraConfig,
    "mixlora-dynamic": MixLoraConfig,
    "mixlora-switch": MixLoraConfig,
    "loramoe": LoraMoeConfig,
    "mola": MolaConfig,
}

# Routing strategy -> auxiliary router-loss module.
# Note: "loramoe" intentionally has no entry (no router loss for that strategy).
router_loss_dict = {
    "mixlora": MixtralRouterLoss,
    "mixlora-dynamic": DynamicRouterLoss,
    "mixlora-switch": SwitchRouterLoss,
    "mola": MolaRouterLoss,
}

# Routing strategy -> sparse-MoE block implementation.
moe_layer_dict = {
    "mixlora": MixtralSparseMoe,
    "mixlora-dynamic": DynamicSparseMoe,
    "mixlora-switch": SwitchSparseMoe,
    "loramoe": LoraMoe,
    "mola": MolaSparseMoe,
}
48
+
49
+
50
def lora_config_factory(config: Dict[str, any]) -> LoraConfig:
    """Instantiate the adapter config class that matches *config*.

    Resolution order: explicit ``peft_type`` first, then ``routing_strategy``,
    falling back to a plain ``LoraConfig``. The result is validated with
    ``check()`` before being returned.

    Fixes: the original annotated the locals as ``TypeAlias[AdapterConfig]``,
    which is not a valid use of ``typing.TypeAlias`` (it is not a generic);
    the annotations are dropped and the double dict lookups collapsed.
    """
    peft_type = config.get("peft_type", "")
    routing_strategy = config.get("routing_strategy", "")
    if peft_type in peft_type_dict:
        config_class = peft_type_dict[peft_type]
    elif routing_strategy in routing_strategy_dict:
        config_class = routing_strategy_dict[routing_strategy]
    else:
        config_class = LoraConfig

    return config_class.from_config(config).check()
63
+
64
+
65
def router_loss_factory(config: MixLoraConfig) -> torch.nn.Module:
    """Build the auxiliary router-loss module for *config*.

    Returns ``None`` when the routing strategy has no registered loss or when
    the config disables the router loss.
    """
    loss_cls = router_loss_dict.get(config.routing_strategy_)
    if loss_cls is None or not config.router_loss_:
        return None
    return loss_cls(config)
72
+
73
+
74
def moe_layer_factory(
    in_features: int,
    device: torch.device,
    config: MolaConfig,
    gate: Optional[torch.Tensor] = None,
) -> torch.nn.Module:
    """Instantiate the sparse-MoE block registered for the config's strategy.

    Raises ``ValueError`` for an unknown ``routing_strategy_``; an optional
    pre-trained ``gate`` tensor is forwarded to the block constructor.
    """
    strategy = config.routing_strategy_
    if strategy not in moe_layer_dict:
        raise ValueError(f"Unknown routing strategy {strategy}")
    moe_cls = moe_layer_dict[strategy]
    return moe_cls(in_features, device, config, gate)
83
+
84
+
85
# Explicit public API of the adapters sub-package.
__all__ = [
    "MixLoraConfig",
    "MixtralRouterLoss",
    "MixtralSparseMoe",
    "DynamicRouterLoss",
    "DynamicSparseMoe",
    "SwitchRouterLoss",
    "SwitchSparseMoe",
    "LoraMoeConfig",
    "LoraMoe",
    "MolaConfig",
    "MolaSparseMoe",
    "peft_type_dict",
    "routing_strategy_dict",
    "router_loss_dict",
    "moe_layer_dict",
    "lora_config_factory",
    "router_loss_factory",
    "moe_layer_factory",
]
c2cite/adapters/loramoe/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Public surface of the LoraMoe adapter sub-package.
from .config import LoraMoeConfig
from .model import LoraMoe

__all__ = [
    "LoraMoeConfig",
    "LoraMoe",
]
c2cite/adapters/loramoe/config.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Dict
4
+
5
+ from moe_peft.common import LoraConfig
6
+
7
+
8
@dataclass
class LoraMoeConfig(LoraConfig):
    """LoRAMoE adapter settings: base LoRA config plus MoE router fields."""

    num_experts_: int = None
    router_init_range_: float = None
    routing_strategy_: str = "loramoe"

    def check(self) -> "LoraMoeConfig":
        """Validate the MoE-specific fields on top of the base LoRA checks."""
        super().check()
        assert isinstance(self.num_experts_, int) and self.num_experts_ > 0
        assert (
            isinstance(self.router_init_range_, float) and self.router_init_range_ >= 0
        )

        return self

    @staticmethod
    def from_config(config: Dict[str, any]) -> "LoraMoeConfig":
        """Build from a raw config dict, inheriting all base LoRA fields."""
        return LoraMoeConfig(
            num_experts_=config["num_experts"],
            router_init_range_=config.get("router_init_range", 5.0),
            **LoraConfig.from_config(config).__dict__,
        )

    def export(self) -> Dict[str, any]:
        """Serialize back to the on-disk adapter config format."""
        config = super().export()
        config["peft_type"] = "LORAMOE"
        config["routing_strategy"] = self.routing_strategy_
        config["num_experts"] = self.num_experts_

        return config

    def expert_config(self, expert_idx: int) -> LoraConfig:
        """Derive the per-expert LoRA config with a namespaced adapter name.

        Bug fix: the previous code called ``copy.deepcopy(super())``; ``super``
        proxy objects cannot be deep-copied (deepcopy falls back to pickling,
        which raises ``TypeError: cannot pickle 'super' object``). Deep-copying
        ``self`` yields an equivalent, LoraConfig-compatible object (extra MoE
        fields are ignored by LoRA consumers).
        """
        config = copy.deepcopy(self)
        config.adapter_name = f"moe.{self.adapter_name}.experts.{expert_idx}"
        return config
c2cite/adapters/loramoe/model.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+ from moe_peft.common import Linear, LLMMoeBlock
8
+
9
+ from .config import LoraMoeConfig
10
+
11
+
12
class LoraMoe(LLMMoeBlock):
    """LoRAMoE block: a dense softmax router that adds every expert's LoRA
    delta to the residual stream, weighted by the gate probabilities."""

    def __init__(
        self,
        in_features: int,
        device: torch.device,
        config: LoraMoeConfig,
        gate: Optional[torch.Tensor] = None,
    ) -> None:
        super().__init__()

        self.adapter_name_: str = config.adapter_name
        # Router math runs in float32 regardless of the model dtype.
        self.dtype_: torch.dtype = torch.float32
        self.gate_ = torch.nn.Linear(
            in_features,
            config.num_experts_,
            bias=False,
            device=device,
            dtype=torch.float32,
        )
        self.experts_ = config.num_experts_
        # Cached flat router logits (tokens x experts), read by the router loss.
        self.router_logits_: torch.Tensor = None

        if gate is None:
            torch.nn.init.kaiming_uniform_(
                self.gate_.weight, a=math.sqrt(config.router_init_range_)
            )
        else:
            # Restore a previously trained gate.
            with torch.no_grad():
                self.gate_.weight.copy_(gate)

    def forward(
        self,
        residual: torch.Tensor,
        hidden_states: torch.Tensor,
        lora_linear: Optional[Linear] = None,
    ) -> torch.Tensor:
        # NOTE(review): indexing routing_weights[:, :, expert_idx] assumes
        # hidden_states is (batch, seq, in_features) — TODO confirm callers.
        assert lora_linear is not None
        router_logits = self.gate_(hidden_states.to(self.dtype_))
        self.router_logits_ = router_logits.reshape(-1, self.experts_)
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)

        # Dense mixture: every expert contributes, scaled by its gate weight.
        for expert_idx in range(self.experts_):
            expert_lora = lora_linear.loras_[
                f"moe.{self.adapter_name_}.experts.{expert_idx}"
            ]
            residual = residual + (
                torch.unsqueeze(routing_weights[:, :, expert_idx], -1)
                * expert_lora.lora_forward(hidden_states)
            ).to(hidden_states.dtype)

        return residual
c2cite/adapters/mixlora/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Public surface of the MixLoRA adapter sub-package: the shared config plus
# the three routing variants (top-k "mixtral", dynamic top-p, and switch).
from .config import MixLoraConfig
from .model import (
    DynamicRouterLoss,
    DynamicSparseMoe,
    MixtralRouterLoss,
    MixtralSparseMoe,
    SwitchRouterLoss,
    SwitchSparseMoe,
)

__all__ = [
    "MixLoraConfig",
    "MixtralRouterLoss",
    "MixtralSparseMoe",
    "DynamicRouterLoss",
    "DynamicSparseMoe",
    "SwitchRouterLoss",
    "SwitchSparseMoe",
]
c2cite/adapters/mixlora/config.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Dict, Optional, Union
4
+
5
+ import torch
6
+ from transformers.activations import ACT2FN
7
+
8
+ from moe_peft.common import LoraConfig
9
+
10
available_routing_strategies = ["mixlora", "mixlora-dynamic", "mixlora-switch"]


@dataclass
class MixLoraConfig(LoraConfig):
    """Configuration for MixLoRA adapters (all three routing strategies).

    Fields that apply to only one strategy stay ``None`` for the others;
    ``check()`` validates exactly the fields relevant to ``routing_strategy_``.
    """

    # expert lora
    expert_config_: LoraConfig = None
    # router config
    router_aux_loss_coef_: float = None
    router_init_range_: float = None
    routing_strategy_: str = None
    jitter_noise_: float = None
    router_loss_: bool = True
    num_experts_: int = None
    act_fn_: Optional[Union[str, torch.nn.Module]] = None
    # mixtral config
    top_k_: int = None
    # dynamic config
    top_p_: float = None
    temperature_: float = None
    # switch transformers config
    router_z_loss_coef_: float = None
    expert_capacity_: int = None
    ffn_dropout_: float = None
    sparse_step_: int = None

    def check(self) -> "MixLoraConfig":
        """Validate shared fields plus the strategy-specific ones."""
        super().check()
        if self.expert_config_ is not None:
            self.expert_config_.check()
        assert (
            isinstance(self.router_aux_loss_coef_, float)
            and self.router_aux_loss_coef_ >= 0
        )
        assert (
            isinstance(self.router_init_range_, float) and self.router_init_range_ >= 0
        )
        assert (
            isinstance(self.routing_strategy_, str)
            and self.routing_strategy_ in available_routing_strategies
        )
        assert isinstance(self.jitter_noise_, float) and self.jitter_noise_ >= 0
        assert isinstance(self.router_loss_, bool)
        assert isinstance(self.num_experts_, int) and self.num_experts_ > 0
        assert self.act_fn_ is None or (
            isinstance(self.act_fn_, str) and self.act_fn_ in ACT2FN
        )
        if self.routing_strategy_ == "mixlora":
            assert isinstance(self.top_k_, int) and self.top_k_ > 0
        elif self.routing_strategy_ == "mixlora-dynamic":
            assert (
                isinstance(self.top_p_, float) and self.top_p_ > 0 and self.top_p_ <= 1
            )
            assert isinstance(self.temperature_, float) and self.temperature_ >= 0
        elif self.routing_strategy_ == "mixlora-switch":
            assert (
                isinstance(self.router_z_loss_coef_, float)
                and self.router_z_loss_coef_ >= 0
            )
            if self.sparse_step_ is not None:
                assert isinstance(self.sparse_step_, int) and self.sparse_step_ > 0
            assert isinstance(self.expert_capacity_, int) and self.expert_capacity_ > 0
            assert isinstance(self.ffn_dropout_, float) and self.ffn_dropout_ >= 0

        return self

    @staticmethod
    def from_config(config: Dict[str, any]) -> "MixLoraConfig":
        """Build from a raw config dict, applying per-strategy defaults."""
        lora_config = MixLoraConfig(**LoraConfig.from_config(config).__dict__)
        if "expert_lora" in config:
            # Expert-level overrides are layered on top of the shared config.
            expert_config = copy.deepcopy(config)
            expert_config.update(config["expert_lora"])
            # Call from_config on the class, not on a throwaway instance.
            lora_config.expert_config_ = LoraConfig.from_config(expert_config)
        lora_config.router_aux_loss_coef_ = config.get(
            "router_aux_loss_coef", 0.001
        )  # for training
        lora_config.routing_strategy_ = config["routing_strategy"]
        lora_config.router_loss_ = config.get("router_loss", True)
        lora_config.num_experts_ = config["num_experts"]
        # silu for mixtral or gelu_new for switch transformers
        # left blank to automatically use the original act_fn of FFN
        lora_config.act_fn_ = config.get("act_fn", None)
        if lora_config.routing_strategy_ == "mixlora":
            lora_config.router_init_range_ = config.get("router_init_range", 0.02)
            lora_config.jitter_noise_ = config.get("jitter_noise", 0.0)
            lora_config.top_k_ = config.get("top_k", 2)
        elif lora_config.routing_strategy_ == "mixlora-dynamic":
            lora_config.router_init_range_ = config.get("router_init_range", 0.02)
            lora_config.jitter_noise_ = config.get("jitter_noise", 0.0)
            lora_config.top_p_ = config.get("top_p", 0.8)
            lora_config.temperature_ = config.get("temperature", 0.0)
        elif lora_config.routing_strategy_ == "mixlora-switch":
            lora_config.router_init_range_ = config.get("router_init_range", 1.0)
            lora_config.jitter_noise_ = config.get("jitter_noise", 0.01)
            lora_config.router_z_loss_coef_ = config.get(
                "router_z_loss_coef", 0.001
            )  # for training
            # expert_capacity = (max_sequence_length / num_experts) * capacity_factor
            # common values of capacity_factor: 1.0, 1.25, 2.0
            lora_config.expert_capacity_ = config.get("expert_capacity", 32)
            lora_config.ffn_dropout_ = config.get("ffn_dropout", 0.0)
            lora_config.sparse_step_ = config.get("sparse_step", None)

        return lora_config

    def export(self) -> Dict[str, any]:
        """Serialize back to the on-disk adapter config format."""
        config = super().export()
        config["peft_type"] = "MIXLORA"
        if self.expert_config_ is not None:
            expert_config = self.expert_config_.export()
            expert_config.pop("peft_type")
            expert_config.pop("target_modules")
            config["expert_lora"] = expert_config
        config["routing_strategy"] = self.routing_strategy_
        config["num_experts"] = self.num_experts_
        if self.act_fn_ is not None and isinstance(self.act_fn_, str):
            config["act_fn"] = self.act_fn_
        if self.routing_strategy_ == "mixlora":
            config["top_k"] = self.top_k_
        elif self.routing_strategy_ == "mixlora-dynamic":
            config["top_p"] = self.top_p_
            config["temperature"] = self.temperature_
        elif self.routing_strategy_ == "mixlora-switch":
            config["expert_capacity"] = self.expert_capacity_
            config["sparse_step"] = self.sparse_step_

        return config

    def expert_config(self, expert_idx: int) -> LoraConfig:
        """Derive the per-expert LoRA config with a namespaced adapter name.

        Bug fix: the previous code called ``copy.deepcopy(super())`` when no
        explicit expert config was set; ``super`` proxy objects cannot be
        deep-copied (pickling them raises ``TypeError``). Deep-copying ``self``
        yields an equivalent LoraConfig-compatible object instead.
        """
        if self.expert_config_ is None:
            config = copy.deepcopy(self)
        else:
            config = copy.deepcopy(self.expert_config_)
        config.adapter_name = f"moe.{self.adapter_name}.experts.{expert_idx}"
        return config
c2cite/adapters/mixlora/model.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Tuple
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from transformers.activations import ACT2FN
6
+
7
+ from moe_peft.common import LLMFeedForward, LLMModelInput, LLMMoeBlock, slice_tensor
8
+
9
+ from .config import MixLoraConfig
10
+
11
+
12
def _mixlora_compatible_forward(
    ffn_layer: LLMFeedForward,
    moe_name: str,
    act_fn: torch.nn.Module,
    expert_mask: torch.Tensor,
    hidden_states: torch.Tensor,
    input_dtype: torch.dtype,
):
    """Fallback per-expert dispatch used when ``ffn_layer`` has no fused
    ``_mixlora_forward`` fast path.

    For each expert, gathers the tokens routed to it (nonzero columns of
    ``expert_mask[expert_idx]``), runs them through that expert's LoRA-adapted
    FFN, and returns one output tensor per expert index.
    """
    final_expert_states = []
    for expert_idx in range(expert_mask.shape[0]):
        # Token (column) indices assigned to this expert.
        _, top_x = torch.where(expert_mask[expert_idx])
        lora_name = f"moe.{moe_name}.experts.{expert_idx}"
        lora_data = slice_tensor(hidden_states, top_x, input_dtype)
        final_expert_states.append(
            ffn_layer._lora_forward(lora_name, act_fn, lora_data)
        )

    return final_expert_states
30
+
31
+
32
def _mixtral_load_balancing_loss_func(
    gate_logits: torch.Tensor,
    num_experts: int,
    top_k: int,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Mixtral auxiliary load-balancing loss.

    Penalizes the dot product between the fraction of tokens routed to each
    expert (hard top-k selection) and the mean softmax probability of that
    expert, scaled by ``num_experts``. When ``attention_mask`` is given,
    padding tokens are excluded from both statistics.

    NOTE(review): the masked branch divides ``gate_logits.shape[0]`` by
    batch*seq to infer a layer count — assumes logits are concatenated over
    layers; TODO confirm against callers.
    """
    routing_weights = torch.nn.functional.softmax(gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = routing_weights.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand(
                (num_hidden_layers, batch_size, sequence_length, top_k, num_experts)
            )
            .reshape(-1, top_k, num_experts)
            .to(routing_weights.device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(
            expert_mask.float() * expert_attention_mask, dim=0
        ) / torch.sum(expert_attention_mask, dim=0)

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(routing_weights.device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(
            routing_weights * router_per_expert_attention_mask, dim=0
        ) / torch.sum(router_per_expert_attention_mask, dim=0)

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts
84
+
85
+
86
class MixtralRouterLoss(torch.nn.Module):
    """Scaled Mixtral load-balancing auxiliary loss."""

    def __init__(self, config: MixLoraConfig) -> None:
        super().__init__()
        self.aux_loss_coef = config.router_aux_loss_coef_
        self.experts = config.num_experts_
        self.topk = config.top_k_

    def forward(self, gate_logits, attention_mask) -> torch.Tensor:
        # Delegate the balancing statistics, then apply the configured scale.
        raw_loss = _mixtral_load_balancing_loss_func(
            gate_logits, self.experts, self.topk, attention_mask
        )
        return self.aux_loss_coef * raw_loss
97
+
98
+
99
class MixtralSparseMoe(LLMMoeBlock):
    """Mixtral-style top-k sparse MoE over per-expert LoRA FFN experts.

    A float32 linear gate scores each token; the top-k experts are evaluated
    through ``ffn_layer`` and combined with renormalized routing weights.
    ``forward`` returns ``(hidden_states, router_logits)``.
    """

    def __init__(
        self,
        in_features: int,
        device: torch.device,
        config: MixLoraConfig,
        gate: Optional[torch.Tensor] = None,
    ) -> None:
        super().__init__()

        self.adapter_name_: str = config.adapter_name
        # Router computations run in float32 for numerical stability.
        self.dtype_: torch.dtype = torch.float32
        self.gate_ = torch.nn.Linear(
            in_features,
            config.num_experts_,
            bias=False,
            device=device,
            dtype=self.dtype_,
        )
        self.act_ = (
            ACT2FN[config.act_fn_]
            if isinstance(config.act_fn_, str)
            else config.act_fn_
        )
        self.experts_: int = config.num_experts_
        self.topk_: int = config.top_k_
        self.jitter_noise_: float = config.jitter_noise_
        # When router_profile_ is enabled, _profiling keeps a running average
        # of per-expert routing pressure in profiler_.
        self.router_profile_: bool = False
        self.profiler_: List[float] = None

        if gate is None:
            torch.nn.init.normal_(
                self.gate_.weight,
                mean=0.0,
                std=config.router_init_range_,
            )
        else:
            # Restore a previously trained gate.
            with torch.no_grad():
                self.gate_.weight.copy_(gate)

    def state_dict(self) -> Dict[str, torch.Tensor]:
        """Only the gate is trainable state owned by this block."""
        return {"gate": self.gate_.weight}

    def _profiling(
        self, batch_size: int, sequence_length: int, selected_experts: torch.Tensor
    ) -> None:
        """Accumulate per-expert routing pressure (running average)."""
        if not self.router_profile_:
            return

        router_statistic_ = list(0 for _ in range(self.experts_))
        for selected in selected_experts.tolist():
            for idx in selected:
                router_statistic_[idx] += 1

        if self.profiler_ is None:
            self.profiler_ = list(0 for _ in range(self.experts_))
            for idx in range(self.experts_):
                self.profiler_[idx] = (
                    router_statistic_[idx] / batch_size
                ) / sequence_length
        else:
            for idx in range(self.experts_):
                pressure = (router_statistic_[idx] / batch_size) / sequence_length
                self.profiler_[idx] = (self.profiler_[idx] + pressure) / 2

    def forward(
        self,
        hidden_states: torch.Tensor,
        ffn_layer: LLMFeedForward,
        input_args: LLMModelInput,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape

        if not input_args.inference_mode_ and self.jitter_noise_ > 0:
            # Multiply the token inputs by the uniform distribution - adding some noise
            hidden_states *= torch.empty_like(hidden_states).uniform_(
                1.0 - self.jitter_noise_, 1.0 + self.jitter_noise_
            )

        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.view(-1, hidden_dim).to(self.dtype_)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate_(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.topk_, dim=-1
        )

        self._profiling(batch_size, sequence_length, selected_experts)

        # Renormalize the kept top-k weights so they sum to 1 per token.
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim),
            dtype=self.dtype_,
            device=hidden_states.device,
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(
            selected_experts, num_classes=self.experts_
        ).permute(2, 1, 0)

        # Perform the computation on each expert
        if input_args.efficient_operator_ and hasattr(ffn_layer, "_mixlora_forward"):
            expert_states = ffn_layer._mixlora_forward(
                self.adapter_name_, self.act_, expert_mask, hidden_states, input_dtype
            )
        else:
            expert_states = _mixlora_compatible_forward(
                ffn_layer,
                self.adapter_name_,
                self.act_,
                expert_mask,
                hidden_states,
                input_dtype,
            )

        # Unpack
        for expert_idx in range(self.experts_):
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_hidden_states = (
                expert_states[expert_idx] * routing_weights[top_x, idx, None]
            )

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(
                0, top_x, current_hidden_states.to(self.dtype_)
            )

        final_hidden_states = final_hidden_states.reshape(
            batch_size, sequence_length, hidden_dim
        ).to(input_dtype)

        return final_hidden_states, router_logits
241
+
242
+
243
+ def _dynamic_top_p(router_logits: torch.Tensor, top_p: float, temperature: float = 0.0):
244
+ if temperature > 0.0:
245
+ router_logits = router_logits / temperature
246
+ sorted_logits, sorted_indices = torch.sort(router_logits, dim=-1, descending=True)
247
+ cumulative_probs = sorted_logits.cumsum(dim=-1)
248
+ expert_mask = cumulative_probs > top_p
249
+ threshold_indices = expert_mask.long().argmax(dim=-1)
250
+ threshold_mask = torch.nn.functional.one_hot(
251
+ threshold_indices, num_classes=sorted_indices.size(-1)
252
+ ).bool()
253
+ expert_mask = expert_mask & ~threshold_mask
254
+ sorted_logits = sorted_logits.masked_fill(expert_mask, 0.0)
255
+ sorted_indices = sorted_indices.masked_fill(expert_mask, -1)
256
+ return sorted_logits, sorted_indices
257
+
258
+
259
def _dynamic_load_balancing_loss_func(
    routing_weights: torch.Tensor,
    num_experts: int,
    top_p: float,
    temperature: float,
) -> torch.Tensor:
    """Load-balancing auxiliary loss for dynamic (top-p) routing.

    Re-runs the top-p selection on ``routing_weights`` and, like Mixtral's
    loss, penalizes the dot product between per-expert token fractions and
    per-expert mean routing probability, scaled by ``num_experts``.
    """
    _, selected_experts = _dynamic_top_p(routing_weights, top_p, temperature)

    expert_mask = torch.empty(
        (num_experts, num_experts, routing_weights.size(0)),
        dtype=routing_weights.dtype,
        device=routing_weights.device,
    )

    # expert_mask[e, slot, token]: whether `token` picked expert `e` in that
    # sorted slot (dropped slots hold index -1 and never match).
    for expert_idx in range(num_experts):
        expert_mask[expert_idx] = (selected_experts == expert_idx).transpose(0, 1)

    expert_mask = expert_mask.permute(2, 1, 0)

    # Compute the percentage of tokens routed to each experts
    tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

    # Compute the average probability of routing to these experts
    router_prob_per_expert = torch.mean(routing_weights, dim=0)

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts
286
+
287
+
288
class DynamicRouterLoss(torch.nn.Module):
    """Scaled load-balancing loss for dynamic top-p routing."""

    def __init__(self, config: MixLoraConfig) -> None:
        super().__init__()
        self.aux_loss_coef = config.router_aux_loss_coef_
        self.experts = config.num_experts_
        self.top_p = config.top_p_
        self.temperature = config.temperature_

    def forward(self, gate_logits, attention_mask) -> torch.Tensor:
        # Convert raw gate logits to probabilities before balancing them.
        probs = torch.nn.functional.softmax(gate_logits, dim=-1)
        balance_loss = _dynamic_load_balancing_loss_func(
            probs,
            self.experts,
            self.top_p,
            self.temperature,
        )
        return self.aux_loss_coef * balance_loss
304
+
305
+
306
class DynamicSparseMoe(LLMMoeBlock):
    """Dynamic (top-p) sparse MoE over per-expert LoRA FFN experts.

    Like :class:`MixtralSparseMoe`, but the number of active experts per token
    is chosen by nucleus-style top-p selection instead of a fixed top-k.
    ``forward`` returns ``(hidden_states, router_logits)``.
    """

    def __init__(
        self,
        in_features: int,
        device: torch.device,
        config: MixLoraConfig,
        gate: Optional[torch.Tensor] = None,
    ) -> None:
        super().__init__()

        self.adapter_name_: str = config.adapter_name
        # Router computations run in float32 for numerical stability.
        self.dtype_: torch.dtype = torch.float32
        self.gate_ = torch.nn.Linear(
            in_features,
            config.num_experts_,
            bias=False,
            device=device,
            dtype=self.dtype_,
        )
        self.act_ = (
            ACT2FN[config.act_fn_]
            if isinstance(config.act_fn_, str)
            else config.act_fn_
        )
        self.experts_: int = config.num_experts_
        self.top_p_: float = config.top_p_
        self.temperature_: float = config.temperature_
        self.jitter_noise_: float = config.jitter_noise_
        # When router_profile_ is enabled, _profiling keeps a running average
        # of per-expert routing pressure in profiler_.
        self.router_profile_: bool = False
        self.profiler_: List[float] = None

        if gate is None:
            torch.nn.init.normal_(
                self.gate_.weight,
                mean=0.0,
                std=config.router_init_range_,
            )
        else:
            # Restore a previously trained gate.
            with torch.no_grad():
                self.gate_.weight.copy_(gate)

    def state_dict(self) -> Dict[str, torch.Tensor]:
        """Only the gate is trainable state owned by this block."""
        return {"gate": self.gate_.weight}

    def _profiling(
        self, batch_size: int, sequence_length: int, selected_experts: torch.Tensor
    ) -> None:
        """Accumulate per-expert routing pressure (running average).

        NOTE(review): with top-p routing, dropped slots in selected_experts
        hold -1, which here increments the *last* expert's counter — looks
        unintended; confirm before trusting the profile numbers.
        """
        if not self.router_profile_:
            return

        router_statistic_ = list(0 for _ in range(self.experts_))
        for selected in selected_experts.tolist():
            for idx in selected:
                router_statistic_[idx] += 1

        if self.profiler_ is None:
            self.profiler_ = list(0 for _ in range(self.experts_))
            for idx in range(self.experts_):
                self.profiler_[idx] = (
                    router_statistic_[idx] / batch_size
                ) / sequence_length
        else:
            for idx in range(self.experts_):
                pressure = (router_statistic_[idx] / batch_size) / sequence_length
                self.profiler_[idx] = (self.profiler_[idx] + pressure) / 2

    def forward(
        self,
        hidden_states: torch.Tensor,
        ffn_layer: LLMFeedForward,
        input_args: LLMModelInput,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape

        if not input_args.inference_mode_ and self.jitter_noise_ > 0:
            # Multiply the token inputs by the uniform distribution - adding some noise
            hidden_states *= torch.empty_like(hidden_states).uniform_(
                1.0 - self.jitter_noise_, 1.0 + self.jitter_noise_
            )

        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.view(-1, hidden_dim).to(self.dtype_)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate_(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)
        routing_weights, selected_experts = _dynamic_top_p(
            routing_weights, self.top_p_, self.temperature_
        )

        self._profiling(batch_size, sequence_length, selected_experts)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim),
            dtype=self.dtype_,
            device=hidden_states.device,
        )

        # expert_mask[e, slot, token]: whether `token` picked expert `e` in
        # that sorted slot (dropped slots hold -1 and never match).
        expert_mask = torch.empty(
            (self.experts_, self.experts_, batch_size * sequence_length),
            dtype=self.dtype_,
            device=hidden_states.device,
        )

        for expert_idx in range(self.experts_):
            expert_mask[expert_idx] = (selected_experts == expert_idx).transpose(0, 1)

        # Perform the computation on each expert
        if input_args.efficient_operator_ and hasattr(ffn_layer, "_mixlora_forward"):
            expert_states = ffn_layer._mixlora_forward(
                self.adapter_name_, self.act_, expert_mask, hidden_states, input_dtype
            )
        else:
            expert_states = _mixlora_compatible_forward(
                ffn_layer,
                self.adapter_name_,
                self.act_,
                expert_mask,
                hidden_states,
                input_dtype,
            )

        # Unpack
        for expert_idx in range(self.experts_):
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_hidden_states = (
                expert_states[expert_idx] * routing_weights[top_x, idx, None]
            )

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(
                0, top_x, current_hidden_states.to(self.dtype_)
            )

        final_hidden_states = final_hidden_states.reshape(
            batch_size, sequence_length, hidden_dim
        ).to(input_dtype)

        return final_hidden_states, router_logits
450
+
451
+
452
+ def _switch_router_z_loss_func(router_logits: torch.Tensor) -> float:
453
+ log_z = torch.logsumexp(router_logits, dim=-1)
454
+ z_loss = log_z**2
455
+ return torch.sum(z_loss) / (router_logits.size(0))
456
+
457
+
458
+ def _switch_load_balancing_loss_func(router_probs: torch.Tensor) -> float:
459
+ num_experts = router_probs.size(-1)
460
+
461
+ expert_mask = torch.argmax(router_probs, dim=-1)
462
+ expert_mask = torch.nn.functional.one_hot(expert_mask, num_classes=num_experts)
463
+
464
+ tokens_per_group_and_expert = torch.mean(expert_mask.float(), dim=0)
465
+
466
+ router_prob_per_group_and_expert = torch.mean(router_probs, dim=0)
467
+ return torch.mean(
468
+ tokens_per_group_and_expert * router_prob_per_group_and_expert
469
+ ) * (num_experts**2)
470
+
471
+
472
class SwitchRouterLoss(torch.nn.Module):
    """Combined Switch-style router loss: z-loss plus auxiliary balancing loss."""

    def __init__(self, config: MixLoraConfig) -> None:
        super().__init__()
        self.experts = config.num_experts_
        self.expert_capacity_ = config.expert_capacity_
        self.z_loss_coef = config.router_z_loss_coef_
        self.aux_loss_coef = config.router_aux_loss_coef_

    def forward(self, router_logits, attention_mask) -> torch.Tensor:
        """Return the weighted loss sum; `attention_mask` is accepted for
        interface parity but unused here."""
        z_loss = _switch_router_z_loss_func(router_logits)
        router_probs = F.softmax(router_logits, dim=-1)
        # recompute expert indexes due to MoE-PEFT constraints
        aux_loss = _switch_load_balancing_loss_func(router_probs)
        return self.z_loss_coef * z_loss + self.aux_loss_coef * aux_loss
486
+
487
+
488
class SwitchSparseMoe(LLMMoeBlock):
    """Switch-Transformer style top-1 MoE block over LoRA experts.

    Each token is routed to its single highest-probability expert, subject to a
    per-expert capacity limit; tokens past capacity keep their input unchanged.
    """

    def __init__(
        self,
        in_features: int,
        device: torch.device,
        config: MixLoraConfig,
        gate: Optional[torch.Tensor] = None,
    ) -> None:
        super().__init__()

        self.adapter_name_: str = config.adapter_name
        # Router math is kept in float32 for numerical stability.
        self.dtype_: torch.dtype = torch.float32
        self.gate_ = torch.nn.Linear(
            in_features,
            config.num_experts_,
            bias=False,
            device=device,
            dtype=self.dtype_,
        )
        self.act_ = (
            ACT2FN[config.act_fn_]
            if isinstance(config.act_fn_, str)
            else config.act_fn_
        )
        self.experts_: int = config.num_experts_
        self.dropout_ = (
            torch.nn.Dropout(config.ffn_dropout_)
            if config.ffn_dropout_ > 0
            else torch.nn.Identity()
        )
        self.expert_capacity_: int = config.expert_capacity_
        self.jitter_noise_: float = config.jitter_noise_
        self.router_profile_: bool = False
        self.profiler_: List[int] = None

        if gate is None:
            torch.nn.init.normal_(
                self.gate_.weight,
                mean=0.0,
                std=config.router_init_range_,
            )
        else:
            with torch.no_grad():
                self.gate_.weight.copy_(gate)

    def _profiling(
        self, batch_size: int, sequence_length: int, router_mask: torch.Tensor
    ) -> None:
        """Accumulate per-expert routing pressure when profiling is enabled."""
        if not self.router_profile_:
            return

        chosen = torch.argmax(router_mask, dim=-1)

        counts = [0] * self.experts_
        for row in chosen.tolist():
            for expert_idx in row:
                counts[expert_idx] += 1

        if self.profiler_ is None:
            self.profiler_ = [0] * self.experts_
            for expert_idx in range(self.experts_):
                self.profiler_[expert_idx] = (
                    counts[expert_idx] / batch_size
                ) / sequence_length
        else:
            # Running average with the previously recorded pressure.
            for expert_idx in range(self.experts_):
                pressure = (counts[expert_idx] / batch_size) / sequence_length
                self.profiler_[expert_idx] = (
                    self.profiler_[expert_idx] + pressure
                ) / 2

    def route(self, hidden_states: torch.Tensor, input_args: LLMModelInput) -> Tuple:
        """Top-1 routing with capacity masking.

        Returns (one-hot expert mask, per-token max probability, raw logits).
        """
        if not input_args.inference_mode_ and self.jitter_noise_ > 0:
            # Multiplicative jitter noise on the inputs during training.
            hidden_states = hidden_states * torch.empty_like(hidden_states).uniform_(
                1.0 - self.jitter_noise_, 1.0 + self.jitter_noise_
            )

        router_logits = self.gate_(hidden_states)
        router_probs = F.softmax(router_logits, dim=-1, dtype=self.dtype_)

        expert_index = torch.nn.functional.one_hot(
            torch.argmax(router_probs, dim=-1), num_classes=self.experts_
        )

        # Cumulative count along the sequence gives each token its arrival
        # priority at its expert; tokens past capacity are masked out.
        token_priority = torch.cumsum(expert_index, dim=-2)
        expert_capacity_mask = token_priority <= self.expert_capacity_
        expert_index = expert_index * expert_capacity_mask

        router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1)
        return expert_index, router_probs, router_logits

    def forward(
        self,
        hidden_states: torch.Tensor,
        ffn_layer: LLMFeedForward,
        input_args: LLMModelInput,
    ) -> Tuple:
        batch_size, sequence_length, _ = hidden_states.shape

        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(self.dtype_)

        router_mask, router_probs, router_logits = self.route(
            hidden_states, input_args
        )

        self._profiling(batch_size, sequence_length, router_mask)

        # Start from the inputs so tokens dropped by capacity pass through.
        next_states = hidden_states.clone()
        for expert_idx in range(self.experts_):
            token_indices = router_mask[:, :, expert_idx].bool()
            lora_name = f"moe.{self.adapter_name_}.experts.{expert_idx}"
            next_states[token_indices] = ffn_layer._lora_forward(
                lora_name, self.act_, hidden_states[token_indices].to(input_dtype)
            ).to(next_states.dtype)

        # NOTE(review): in inference mode the expert outputs (`next_states`)
        # are discarded and the router input is returned unchanged — confirm
        # this is intentional rather than a missing `next_states` path.
        if input_args.inference_mode_:
            hidden_states = hidden_states.to(input_dtype)
        else:
            hidden_states = self.dropout_(router_probs * next_states).to(input_dtype)

        return hidden_states, router_logits.reshape(-1, self.experts_)
c2cite/adapters/mola/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from .config import MolaConfig
2
+ from .model import MolaRouterLoss, MolaSparseMoe
3
+
4
+ __all__ = [
5
+ "MolaConfig",
6
+ "MolaSparseMoe",
7
+ "MolaRouterLoss",
8
+ ]
c2cite/adapters/mola/config.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Dict
4
+
5
+ from moe_peft.common import LoraConfig
6
+
7
+
8
@dataclass
class MolaConfig(LoraConfig):
    """Configuration for MoLA: a per-layer top-k mixture of LoRA experts."""

    top_k_: int = None
    num_experts_: int = None
    routing_strategy_: str = "mola"
    router_init_range_: float = None
    # this router loss is copied from MixLoRA
    # and only for test MoE-PEFT propose
    router_aux_loss_coef_: float = None
    router_loss_: bool = True

    def check(self) -> "MolaConfig":
        """Validate MoLA-specific fields on top of the base LoRA checks."""
        super().check()
        assert isinstance(self.top_k_, int) and self.top_k_ > 0
        assert isinstance(self.num_experts_, int) and self.num_experts_ > 0
        assert (
            isinstance(self.router_init_range_, float) and self.router_init_range_ >= 0
        )
        assert (
            isinstance(self.router_aux_loss_coef_, float)
            and self.router_aux_loss_coef_ >= 0
        )
        assert isinstance(self.router_loss_, bool)

        return self

    @staticmethod
    def from_config(config: Dict[str, any]) -> "MolaConfig":
        """Build a MolaConfig from a plain dict, applying MoLA defaults."""
        return MolaConfig(
            top_k_=config.get("top_k", 2),
            num_experts_=config["num_experts"],
            router_init_range_=config.get("router_init_range", 5.0),
            router_aux_loss_coef_=config.get("router_aux_loss_coef", 0.001),
            router_loss_=config.get("router_loss", False),
            **LoraConfig.from_config(config).__dict__,
        )

    def export(self) -> Dict[str, any]:
        """Serialize to a PEFT-style configuration dict."""
        config = super().export()
        config["peft_type"] = "MOLA"
        config["routing_strategy"] = self.routing_strategy_
        config["num_experts"] = self.num_experts_
        config["top_k"] = self.top_k_

        return config

    def expert_config(self, expert_idx: int) -> LoraConfig:
        """Derive the per-expert LoRA config with a namespaced adapter name.

        BUG FIX: the original called `copy.deepcopy(super())`, which attempts
        to deep-copy the `super` proxy object itself and fails at runtime
        (`TypeError: cannot pickle 'super' object`). Deep-copy the config
        instance instead; the copy is-a LoraConfig, so callers are unaffected.
        """
        config = copy.deepcopy(self)
        config.adapter_name = f"moe.{self.adapter_name}.experts.{expert_idx}"
        return config
c2cite/adapters/mola/model.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+ from moe_peft.common import Linear, LLMMoeBlock
8
+
9
+ from .config import MolaConfig
10
+
11
+
12
+ # copied from mixlora.model._mixtral_load_balancing_loss_func
13
+ def _mixtral_load_balancing_loss_func(
14
+ gate_logits: torch.Tensor,
15
+ num_experts: int,
16
+ top_k: int,
17
+ attention_mask: Optional[torch.Tensor] = None,
18
+ ) -> float:
19
+ routing_weights = torch.nn.functional.softmax(gate_logits, dim=-1)
20
+
21
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
22
+
23
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
24
+
25
+ if attention_mask is None:
26
+ # Compute the percentage of tokens routed to each experts
27
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
28
+
29
+ # Compute the average probability of routing to these experts
30
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
31
+ else:
32
+ batch_size, sequence_length = attention_mask.shape
33
+ num_hidden_layers = routing_weights.shape[0] // (batch_size * sequence_length)
34
+
35
+ # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
36
+ expert_attention_mask = (
37
+ attention_mask[None, :, :, None, None]
38
+ .expand(
39
+ (num_hidden_layers, batch_size, sequence_length, top_k, num_experts)
40
+ )
41
+ .reshape(-1, top_k, num_experts)
42
+ .to(routing_weights.device)
43
+ )
44
+
45
+ # Compute the percentage of tokens routed to each experts
46
+ tokens_per_expert = torch.sum(
47
+ expert_mask.float() * expert_attention_mask, dim=0
48
+ ) / torch.sum(expert_attention_mask, dim=0)
49
+
50
+ # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
51
+ router_per_expert_attention_mask = (
52
+ attention_mask[None, :, :, None]
53
+ .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
54
+ .reshape(-1, num_experts)
55
+ .to(routing_weights.device)
56
+ )
57
+
58
+ # Compute the average probability of routing to these experts
59
+ router_prob_per_expert = torch.sum(
60
+ routing_weights * router_per_expert_attention_mask, dim=0
61
+ ) / torch.sum(router_per_expert_attention_mask, dim=0)
62
+
63
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
64
+ return overall_loss * num_experts
65
+
66
+
67
class MolaRouterLoss(torch.nn.Module):
    """Auxiliary balancing loss for MoLA routing (reuses the Mixtral formula)."""

    def __init__(self, config: MolaConfig) -> None:
        super().__init__()
        self.aux_loss_coef = config.router_aux_loss_coef_
        self.experts = config.num_experts_
        self.topk = config.top_k_

    def forward(self, gate_logits, attention_mask) -> torch.Tensor:
        """Scale the Mixtral load-balancing loss by the configured coefficient."""
        loss = _mixtral_load_balancing_loss_func(
            gate_logits, self.experts, self.topk, attention_mask
        )
        return self.aux_loss_coef * loss
78
+
79
+
80
class MolaSparseMoe(LLMMoeBlock):
    """MoLA top-k sparse MoE block mixing per-expert LoRA outputs into a residual."""

    def __init__(
        self,
        in_features: int,
        device: torch.device,
        config: MolaConfig,
        gate: Optional[torch.Tensor] = None,
    ) -> None:
        super().__init__()

        self.adapter_name_: str = config.adapter_name
        # Router math is done in float32 for numerical stability.
        self.dtype_: torch.dtype = torch.float32
        self.gate_ = torch.nn.Linear(
            in_features,
            config.num_experts_,
            bias=False,
            device=device,
            dtype=torch.float32,
        )
        self.experts_ = config.num_experts_
        self.topk_ = config.top_k_
        # Saved so the (optional) router loss can be computed outside the block.
        self.router_logits_: torch.Tensor = None

        if gate is None:
            torch.nn.init.kaiming_uniform_(
                self.gate_.weight, a=math.sqrt(config.router_init_range_)
            )
        else:
            with torch.no_grad():
                self.gate_.weight.copy_(gate)

    def forward(
        self,
        residual: torch.Tensor,
        hidden_states: torch.Tensor,
        lora_linear: Optional[Linear] = None,
    ):
        assert lora_linear is not None
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        input_dtype = hidden_states.dtype
        flat_states = hidden_states.view(-1, hidden_dim).to(self.dtype_)

        router_logits = self.gate_(flat_states)
        self.router_logits_ = router_logits.reshape(-1, self.experts_)
        dense_weights = F.softmax(router_logits, dim=1, dtype=self.dtype_)

        routing_weights, selected_experts = torch.topk(
            dense_weights, self.topk_, dim=-1
        )
        # Renormalize so the kept top-k weights sum to one per token.
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        expert_mask = torch.nn.functional.one_hot(
            selected_experts, num_classes=self.experts_
        ).permute(2, 1, 0)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, lora_linear.out_features_),
            dtype=self.dtype_,
            device=flat_states.device,
        )

        for expert_idx in range(self.experts_):
            expert_lora = lora_linear.loras_[
                f"moe.{self.adapter_name_}.experts.{expert_idx}"
            ]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Gather the tokens assigned to this expert, run them through its
            # LoRA, and scale by the (renormalized) routing weight.
            current_state = flat_states[None, top_x].reshape(-1, hidden_dim)
            weighted_output = (
                expert_lora.lora_forward(current_state)
                * routing_weights[top_x, idx, None]
            )
            final_hidden_states.index_add_(
                0, top_x, weighted_output.to(self.dtype_)
            )

        final_hidden_states = final_hidden_states.reshape(
            batch_size, sequence_length, lora_linear.out_features_
        ).to(input_dtype)

        return residual + final_hidden_states
c2cite/common/__init__.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Basic Abstract Class
2
+ from .abstracts import (
3
+ LLMAttention,
4
+ LLMCache,
5
+ LLMDecoder,
6
+ LLMFeedForward,
7
+ LLMForCausalLM,
8
+ LLMMoeBlock,
9
+ LLMOutput,
10
+ )
11
+ from .attention import (
12
+ eager_attention_forward,
13
+ flash_attention_forward,
14
+ prepare_4d_causal_attention_mask,
15
+ )
16
+ from .cache import (
17
+ DynamicCache,
18
+ HybridCache,
19
+ SlidingWindowCache,
20
+ StaticCache,
21
+ cache_factory,
22
+ )
23
+ from .checkpoint import (
24
+ CHECKPOINT_CLASSES,
25
+ CheckpointNoneFunction,
26
+ CheckpointOffloadFunction,
27
+ CheckpointRecomputeFunction,
28
+ )
29
+
30
+ # Model Configuration
31
+ from .config import (
32
+ AdapterConfig,
33
+ InputData,
34
+ Labels,
35
+ LLMBatchConfig,
36
+ LLMModelConfig,
37
+ LLMModelInput,
38
+ LLMModelOutput,
39
+ LoraConfig,
40
+ Masks,
41
+ Prompt,
42
+ Tokens,
43
+ )
44
+ from .feed_forward import FeedForward
45
+
46
+ # LoRA
47
+ from .lora_linear import Linear, Lora, get_range_tensor
48
+
49
+ # MoEs
50
+ from .moe_utils import collect_plugin_router_logtis, slice_tensor, unpack_router_logits
51
+ from .rope import ROPE_INIT_FUNCTIONS
52
+
53
+ __all__ = [
54
+ "prepare_4d_causal_attention_mask",
55
+ "eager_attention_forward",
56
+ "flash_attention_forward",
57
+ "LLMCache",
58
+ "DynamicCache",
59
+ "HybridCache",
60
+ "SlidingWindowCache",
61
+ "StaticCache",
62
+ "cache_factory",
63
+ "CheckpointNoneFunction",
64
+ "CheckpointOffloadFunction",
65
+ "CheckpointRecomputeFunction",
66
+ "CHECKPOINT_CLASSES",
67
+ "FeedForward",
68
+ "slice_tensor",
69
+ "unpack_router_logits",
70
+ "collect_plugin_router_logtis",
71
+ "get_range_tensor",
72
+ "Lora",
73
+ "Linear",
74
+ "LLMAttention",
75
+ "LLMFeedForward",
76
+ "LLMMoeBlock",
77
+ "LLMDecoder",
78
+ "LLMOutput",
79
+ "LLMForCausalLM",
80
+ "Tokens",
81
+ "Labels",
82
+ "Masks",
83
+ "Prompt",
84
+ "InputData",
85
+ "LLMModelConfig",
86
+ "LLMModelOutput",
87
+ "LLMBatchConfig",
88
+ "LLMModelInput",
89
+ "AdapterConfig",
90
+ "LoraConfig",
91
+ "ROPE_INIT_FUNCTIONS",
92
+ ]
c2cite/common/abstracts.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABCMeta
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+
6
+ from .config import LLMModelConfig, LLMModelInput
7
+
8
+
9
class LLMCache(torch.nn.Module):
    """Abstract base for key/value caches; subclasses implement the accessors."""

    def __init__(self):
        super().__init__()

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Append key/value states for `layer_idx` and return the full cache."""
        raise NotImplementedError("Make sure to implement `update` in a subclass.")

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        # TODO: deprecate this function in favor of `cache_position`
        raise NotImplementedError(
            "Make sure to implement `get_seq_length` in a subclass."
        )

    def get_max_length(self) -> Optional[int]:
        """Maximum cache capacity, or None for unbounded caches."""
        raise NotImplementedError(
            "Make sure to implement `get_max_length` in a subclass."
        )

    def get_usable_length(
        self, new_seq_length: int, layer_idx: Optional[int] = 0
    ) -> int:
        """Number of cached tokens usable when appending `new_seq_length` more."""
        max_length = self.get_max_length()
        previous_seq_length = self.get_seq_length(layer_idx)
        # A bounded cache that would overflow can only keep the tail.
        if max_length is not None and previous_seq_length + new_seq_length > max_length:
            return max_length - new_seq_length
        return previous_seq_length

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorder per-layer caches along the batch dimension for beam search."""
        for layer_idx in range(len(self.key_cache)):
            for cache in (self.key_cache, self.value_cache):
                device = cache[layer_idx].device
                cache[layer_idx] = cache[layer_idx].index_select(
                    0, beam_idx.to(device)
                )
52
+
53
+
54
class LLMAttention(metaclass=ABCMeta):
    """Interface for model-specific attention modules."""

    @classmethod
    def state_dict(self) -> Dict[str, torch.nn.Module]:
        """Return the named projection modules that carry adapter weights."""
        return {}

    @classmethod
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Compute attention for one layer; implemented per model backend."""
        pass
70
+
71
+
72
class LLMFeedForward(metaclass=ABCMeta):
    """Interface for model-specific feed-forward (MLP) modules."""

    @classmethod
    def state_dict(self) -> Dict[str, torch.nn.Module]:
        """Return the named linear modules that carry adapter weights."""
        return {}

    @classmethod
    def _batch_forward(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Run the MLP over a batch, dispatching per-adapter as needed."""
        pass

    @classmethod
    def _lora_forward(
        self, lora_name: str, act_fn: torch.nn.Module, data: torch.Tensor
    ) -> torch.Tensor:
        """Run the MLP with a single named LoRA adapter applied."""
        pass
88
+
89
+
90
class LLMMoeBlock(metaclass=ABCMeta):
    """Base class for mixture-of-experts blocks; holds common router state."""

    def __init__(self) -> None:
        super().__init__()

        # Adapter identity and routing configuration, filled in by subclasses.
        self.adapter_name_: str = None
        self.dtype_: torch.dtype = None
        self.gate_: torch.nn.Linear = None
        self.experts_: int = None
        # Optional per-expert routing-pressure profiling.
        self.router_profile_: bool = False
        self.profiler_: List[int] = None

    @classmethod
    def forward(
        self,
        residual: torch.Tensor,
        hidden_states: torch.Tensor,
        **kwargs,
    ) -> Tuple:
        """Route tokens to experts and combine their outputs; see subclasses."""
        pass
109
+
110
+
111
class LLMDecoder(metaclass=ABCMeta):
    """Interface for a single transformer decoder layer."""

    def __init__(self) -> None:
        super().__init__()
        # Populated by the concrete model implementation.
        self.self_attn_: LLMAttention = None
        self.mlp_: LLMFeedForward = None

    @classmethod
    def state_dict(
        self,
    ) -> Tuple[Dict[str, torch.nn.Module], Dict[str, torch.nn.Module]]:
        """Return the attention and feed-forward modules carrying weights."""
        return {}

    @classmethod
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run one decoder layer (attention + MLP); implemented per backend."""
        pass
134
+
135
+
136
class LLMOutput(metaclass=ABCMeta):
    """Interface for task-specific output heads (logits + loss)."""

    @classmethod
    def state_dict(self) -> Dict[str, torch.nn.Module]:
        """Return the named modules of the output head."""
        return {}

    @classmethod
    def forward(self, data: torch.Tensor) -> torch.Tensor:
        """Project hidden states to task outputs (e.g. logits)."""
        pass

    @classmethod
    def loss(
        self,
        input_ids: torch.Tensor,
        output_logits: torch.Tensor,
        labels: List[List[int]],
    ) -> torch.Tensor:
        """Compute the task loss from logits and labels."""
        pass
154
+
155
class LLMForCausalLM(metaclass=ABCMeta):
    """Interface every wrapped causal-LM backbone must implement."""

    @classmethod
    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Map token ids to input embeddings."""
        pass

    @classmethod
    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (cos, sin) rotary embedding tensors."""
        pass

    @classmethod
    def decoder_stack(self) -> List[LLMDecoder]:
        """Return the ordered list of decoder layers."""
        pass

    @classmethod
    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the final layer norm."""
        pass

    @classmethod
    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:
        """Build the model-specific 4D causal attention mask."""
        pass

    @classmethod
    def cache_implementation(self) -> str:
        # Default KV-cache flavor; overridden by models needing static/hybrid.
        return "dynamic"

    @classmethod
    def model_config(self) -> LLMModelConfig:
        """Return the model's configuration object."""
        pass

    @staticmethod
    def from_pretrained(llm_model, **kwargs):
        """Wrap a pretrained HF model into this interface."""
        pass
c2cite/common/attention.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import math
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from transformers.utils import is_flash_attn_2_available
8
+
9
+ from .cache import LLMCache, StaticCache
10
+
11
+ if is_flash_attn_2_available():
12
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
13
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
14
+
15
+ _flash_supports_window_size = "window_size" in list(
16
+ inspect.signature(flash_attn_func).parameters
17
+ )
18
+
19
+
20
def prepare_4d_causal_attention_mask(
    attention_mask: torch.Tensor,
    input_tensor: torch.Tensor,
    cache_position: torch.Tensor,
    past_key_values: LLMCache,
) -> torch.Tensor:
    """Build a 4D additive causal mask of shape (batch, 1, q_len, kv_len).

    Disallowed positions hold the dtype's minimum value; padded positions from
    `attention_mask` (if given) are masked as well.
    """
    past_seen_tokens = (
        past_key_values.get_seq_length() if past_key_values is not None else 0
    )
    if past_seen_tokens is None:
        past_seen_tokens = 0

    using_static_cache = isinstance(past_key_values, StaticCache)

    dtype, device = input_tensor.dtype, input_tensor.device
    min_dtype = torch.finfo(dtype).min
    sequence_length = input_tensor.shape[1]
    # Static caches are preallocated, so the mask must span their full length.
    if using_static_cache:
        target_length = past_key_values.get_max_length()
    elif isinstance(attention_mask, torch.Tensor):
        target_length = attention_mask.shape[-1]
    else:
        target_length = past_seen_tokens + sequence_length + 1

    causal_mask = torch.full(
        (sequence_length, target_length),
        fill_value=min_dtype,
        dtype=dtype,
        device=device,
    )
    if sequence_length != 1:
        causal_mask = torch.triu(causal_mask, diagonal=1)
    # Keep positions at or before each query's cache position unmasked.
    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(
        -1, 1
    )
    causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
    if attention_mask is not None:
        causal_mask = causal_mask.clone()  # contiguous copy for in-place edit
        mask_length = attention_mask.shape[-1]
        padding_mask = (
            causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
        ) == 0
        causal_mask[:, :, :, :mask_length] = causal_mask[
            :, :, :, :mask_length
        ].masked_fill(padding_mask, min_dtype)

    return causal_mask
71
+
72
+
73
def eager_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Plain (non-fused) scaled dot-product attention.

    Returns `(output, probs)` where `output` is transposed to
    (batch, q_len, heads, head_dim) and `probs` are the softmaxed scores.
    """
    head_dim = query_states.size(-1)
    scores = torch.matmul(
        query_states, key_states.transpose(2, 3)
    ) / math.sqrt(head_dim)
    if attention_mask is not None:
        # Only the slice of the mask covering the key length applies.
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
    # Softmax in float32 for stability, then cast back to the value dtype.
    probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(value_states.dtype)
    output = torch.matmul(probs, value_states).transpose(1, 2).contiguous()
    return output, probs
92
+
93
+
94
+ def _get_unpad_data(
95
+ attention_mask: torch.Tensor,
96
+ ) -> Tuple[torch.Tensor, torch.Tensor, int]:
97
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
98
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
99
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
100
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
101
+ return (
102
+ indices,
103
+ cu_seqlens,
104
+ max_seqlen_in_batch,
105
+ )
106
+
107
+
108
def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
):
    """Strip padding for flash-attn varlen kernels.

    Returns unpadded q/k/v plus the index / cu_seqlen / max_seqlen metadata
    needed to re-pad the kernel output afterwards.
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    flat_kv = (batch_size * kv_seq_len, num_key_value_heads, head_dim)
    key_layer = index_first_axis(key_layer.reshape(*flat_kv), indices_k)
    value_layer = index_first_axis(value_layer.reshape(*flat_kv), indices_k)

    if query_length == kv_seq_len:
        # Prefill: queries share the key layout, so reuse the key metadata.
        query_layer = index_first_axis(
            query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k
        )
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        # Decoding: exactly one query per sequence.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -q_len: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
            query_layer, attention_mask
        )

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )
155
+
156
+
157
def prepare_fa2_from_position_ids(query, key, value, position_ids):
    """Flatten batched q/k/v for the flash-attn varlen kernel using position ids.

    Sequence starts are detected wherever `position_ids == 0`, so packed
    (concatenated) sequences within one batch row are split correctly.
    """
    query = query.view(-1, query.size(-2), query.size(-1))
    key = key.view(-1, key.size(-2), key.size(-1))
    value = value.view(-1, value.size(-2), value.size(-1))
    position_ids = position_ids.flatten()
    indices_q = torch.arange(
        position_ids.size(0), device=position_ids.device, dtype=torch.int32
    )

    # Boundaries are every index whose position id restarts at 0, plus the
    # total token count as the final boundary.
    cu_seq_lens = torch.cat(
        (
            indices_q[position_ids == 0],
            torch.tensor(
                position_ids.size(), device=position_ids.device, dtype=torch.int32
            ),
        )
    )

    max_length = position_ids.max() + 1

    return (
        query,
        key,
        value,
        indices_q,
        (cu_seq_lens, cu_seq_lens),
        (max_length, max_length),
    )
185
+
186
+
187
def flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
):
    """Dispatch to the appropriate flash-attn kernel.

    Three paths: (1) padded batches -> unpad + varlen kernel + repad;
    (2) packed sequences (signalled by non-monotonic position ids) -> varlen
    kernel; (3) dense batches -> plain `flash_attn_func`.
    """
    # With a top-left-aligned mask, causal masking must be disabled for
    # single-token (decode) queries.
    if not use_top_left_mask:
        causal = is_causal
    else:
        causal = is_causal and query_length != 1

    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence
    # length (source length).
    use_sliding_windows = (
        _flash_supports_window_size
        and sliding_window is not None
        and key_states.shape[1] > sliding_window
    )
    flash_kwargs = (
        {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}
    )

    if deterministic is not None:
        flash_kwargs["deterministic"] = deterministic
    if softcap is not None:
        flash_kwargs["softcap"] = softcap

    if attention_mask is not None:
        # Contains at least one padding token in the sequence.
        batch_size = query_states.shape[0]
        (
            query_states,
            key_states,
            value_states,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        ) = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    elif (
        position_ids is not None
        and not (torch.diff(position_ids, dim=-1) >= 0).all()
        and query_length != 1
    ):
        # Packed (concatenated) sequences without an attention mask.
        batch_size = query_states.size(0)
        (
            query_states,
            key_states,
            value_states,
            _,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        ) = prepare_fa2_from_position_ids(
            query_states, key_states, value_states, position_ids
        )

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )

        attn_output = attn_output.view(
            batch_size, -1, attn_output.size(-2), attn_output.size(-1)
        )

    else:
        attn_output = flash_attn_func(
            query_states,
            key_states,
            value_states,
            dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )

    return attn_output
c2cite/common/cache.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+ from transformers.utils import is_torchdynamo_compiling
6
+
7
+ from .abstracts import LLMCache
8
+ from .config import LLMModelConfig
9
+
10
+
11
class DynamicCache(LLMCache):
    """KV cache that grows along the sequence dimension as generation proceeds.

    Mirrors the HuggingFace ``DynamicCache`` interface: one key and one value
    tensor per layer. Skipped layers are represented by an empty-list
    placeholder until they are first updated.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__()
        self._seen_tokens = (
            0  # Used in `generate` to keep tally of how many tokens the cache has seen
        )
        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []

    def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
        """
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
        sequence length.
        """
        if layer_idx < len(self):
            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
        else:
            raise KeyError(
                f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}"
            )

    def __iter__(self):
        """
        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
        keys and values
        """
        for layer_idx in range(len(self)):
            yield (self.key_cache[layer_idx], self.value_cache[layer_idx])

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        return len(self.key_cache)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Append new key/value states for `layer_idx` and return the full cache for that layer."""
        # Update the number of seen tokens (only counted once per forward pass,
        # on the first layer).
        if layer_idx == 0:
            self._seen_tokens += key_states.shape[-2]

        # Update the cache
        if len(self.key_cache) <= layer_idx:
            # There may be skipped layers, fill them with empty lists
            for _ in range(len(self.key_cache), layer_idx):
                self.key_cache.append([])
                self.value_cache.append([])
            self.key_cache.append(key_states)
            self.value_cache.append(value_states)
        elif (
            len(self.key_cache[layer_idx]) == 0
        ):  # fills previously skipped layers; checking for tensor causes errors
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat(
                [self.key_cache[layer_idx], key_states], dim=-2
            )
            self.value_cache[layer_idx] = torch.cat(
                [self.value_cache[layer_idx], value_states], dim=-2
            )

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # TODO: deprecate this function in favor of `cache_position`
        is_empty_layer = (
            len(self.key_cache) == 0  # no cache in any layer
            or len(self.key_cache)
            <= layer_idx  # skipped `layer_idx` and hasn't run a layer with cache after it
            or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
        )
        layer_seq_length = (
            self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
        )
        return layer_seq_length

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return None

    def crop(self, max_length: int):
        """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
        negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search.
        """
        # In case it is negative
        if max_length < 0:
            max_length = self.get_seq_length() - abs(max_length)

        if self.get_seq_length() <= max_length:
            return

        self._seen_tokens = max_length
        for idx in range(len(self.key_cache)):
            # BUGFIX: the original used `self.key_cache[idx] != []`, which for a
            # real tensor performs element-wise comparison and fails when used
            # as a boolean; `len(...)` works for both the empty-list placeholder
            # and a tensor (batch dimension).
            if len(self.key_cache[idx]) > 0:
                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
                self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]

    def batch_split(
        self, full_batch_size: int, split_size: int
    ) -> List["DynamicCache"]:
        """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`"""
        out = []
        for i in range(0, full_batch_size, split_size):
            current_split = DynamicCache()
            current_split._seen_tokens = self._seen_tokens
            current_split.key_cache = [
                tensor[i : i + split_size] for tensor in self.key_cache
            ]
            current_split.value_cache = [
                tensor[i : i + split_size] for tensor in self.value_cache
            ]
            out.append(current_split)
        return out

    @classmethod
    def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache":
        """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
        `generation.utils`"""
        cache = cls()
        for idx in range(len(splits[0])):
            key_cache = [
                current.key_cache[idx]
                for current in splits
                if len(current.key_cache[idx]) > 0
            ]
            # BUGFIX: the original gathered `current.key_cache[idx]` here,
            # silently storing keys in the value cache after a re-stack.
            value_cache = [
                current.value_cache[idx]
                for current in splits
                if len(current.value_cache[idx]) > 0
            ]
            if len(key_cache) > 0:
                layer_keys = torch.cat(key_cache, dim=0)
                layer_values = torch.cat(value_cache, dim=0)
                cache.update(layer_keys, layer_values, idx)
        return cache

    def batch_repeat_interleave(self, repeats: int):
        """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search."""
        for layer_idx in range(len(self)):
            self.key_cache[layer_idx] = self.key_cache[layer_idx].repeat_interleave(
                repeats, dim=0
            )
            self.value_cache[layer_idx] = self.value_cache[layer_idx].repeat_interleave(
                repeats, dim=0
            )

    def batch_select_indices(self, indices: torch.Tensor):
        """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search."""
        for layer_idx in range(len(self)):
            self.key_cache[layer_idx] = self.key_cache[layer_idx][indices, ...]
            self.value_cache[layer_idx] = self.value_cache[layer_idx][indices, ...]
171
+
172
+
173
class StaticCache(LLMCache):
    """Fixed-size KV cache, pre-allocated once so it is `torch.compile`/CUDA-graph friendly.

    One zero tensor pair of shape ``(batch_size, n_kv_heads, max_cache_len, head_dim)``
    is allocated per layer; `update` writes into it in place at `cache_position`.
    """

    def __init__(
        self,
        config: LLMModelConfig,
        batch_size: int,
        max_cache_len: int,
        device: torch.device,
        dtype: torch.dtype = torch.float32,
    ) -> None:
        super().__init__()
        self.batch_size = batch_size
        # Fall back to the model's maximum sequence length when no explicit cap is given.
        self.max_cache_len = (
            config.max_seq_len_ if max_cache_len is None else max_cache_len
        )

        self.head_dim = config.head_dim_

        self.dtype = dtype
        self.num_key_value_heads = config.n_kv_heads_

        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        # Note: There will be significant perf decrease if switching to use 5D tensors instead.
        cache_shape = (
            self.batch_size,
            self.num_key_value_heads,
            self.max_cache_len,
            self.head_dim,
        )
        for idx in range(config.n_layers_):
            new_layer_key_cache = torch.zeros(
                cache_shape, dtype=self.dtype, device=device
            )
            new_layer_value_cache = torch.zeros(
                cache_shape, dtype=self.dtype, device=device
            )
            # Notes:
            # 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            #    breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
            #    it is not needed anyway)
            # 2. `torch.export()` requires mutations to be registered as buffers.
            if not is_torchdynamo_compiling():
                # NOTE(review): fresh zero buffers replace the tensors allocated
                # above (which are then discarded); assumes LLMCache is an
                # nn.Module so `register_buffer` exists — confirm in abstracts.py.
                self.register_buffer(
                    f"key_cache_{idx}",
                    torch.zeros(cache_shape, dtype=dtype, device=device),
                )
                self.register_buffer(
                    f"value_cache_{idx}",
                    torch.zeros(cache_shape, dtype=dtype, device=device),
                )
                new_layer_key_cache = getattr(self, f"key_cache_{idx}")
                new_layer_value_cache = getattr(self, f"value_cache_{idx}")
                torch._dynamo.mark_static_address(new_layer_key_cache)
                torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Write `key_states`/`value_states` in place at `cache_position` and
        return the full (padded) per-layer key/value tensors.

        NOTE(review): `cache_kwargs` is dereferenced unconditionally, so callers
        must always pass a dict (possibly without "cache_position").
        """
        cache_position = cache_kwargs.get("cache_position")

        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]

        if cache_position is None:
            # No positions given: the incoming states must cover the whole cache.
            k_out.copy_(key_states)
            v_out.copy_(value_states)
        else:
            # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to
            # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
            # operation, that avoids copies and uses less memory.
            try:
                k_out.index_copy_(2, cache_position, key_states)
                v_out.index_copy_(2, cache_position, value_states)
            except NotImplementedError:
                # The operator 'aten::index_copy.out' is not currently implemented for the MPS device.
                k_out[:, :, cache_position] = key_states
                v_out[:, :, cache_position] = value_states

        return k_out, v_out

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        # NOTE(review): returns a 0-dim tensor, not a Python int.
        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
        return self.max_cache_len

    def reset(self):
        """Resets the cache values while preserving the objects"""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            self.value_cache[layer_idx].zero_()
276
+
277
+
278
class SlidingWindowCache(StaticCache):
    """Static cache whose length is capped at the model's sliding window.

    Once the window is full, existing entries are rolled left by one slot on
    each decode step so the cache always holds the most recent
    ``max_cache_len`` tokens.
    """

    def __init__(
        self,
        config: LLMModelConfig,
        batch_size: int,
        max_cache_len: int,
        device: torch.device,
        dtype: torch.dtype = torch.float32,
    ) -> None:
        # Validate before allocating anything: a sliding-window cache is
        # meaningless without a configured window size.
        if not hasattr(config, "sliding_window_") or config.sliding_window_ is None:
            raise ValueError(
                "Setting `cache_implementation` to 'sliding_window' requires the model config supporting "
                "sliding window attention, please check if there is a `sliding_window` field in the model "
                "config and it's not set to None."
            )
        max_cache_len = min(config.sliding_window_, max_cache_len)
        # BUGFIX: the original additionally called a bare `super().__init__()`
        # before this one; `StaticCache.__init__` has required parameters, so
        # that call would raise TypeError. Only the fully-parameterized call is kept.
        super().__init__(
            config=config,
            batch_size=batch_size,
            max_cache_len=max_cache_len,
            device=device,
            dtype=dtype,
        )

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor]:
        """Write new states into the rolling window and return the layer's key/value tensors."""
        cache_position = cache_kwargs.get("cache_position")
        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]

        # assume this only happens in prefill phase when prompt length > sliding_window_size (= max_cache_len)
        if cache_position.shape[0] > self.max_cache_len:
            k_out = key_states[:, :, -self.max_cache_len :, :]
            v_out = value_states[:, :, -self.max_cache_len :, :]
            # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly
            self.key_cache[layer_idx] += k_out
            self.value_cache[layer_idx] += v_out
            # we should return the whole states instead of k_out, v_out to take the whole prompt
            # into consideration when building kv cache instead of just throwing away tokens outside of the window
            return key_states, value_states

        # Build a left-rotation index: once the window is full (`to_shift`),
        # every slot shifts by one so the oldest token falls off.
        slicing = torch.ones(
            self.max_cache_len, dtype=torch.long, device=value_states.device
        ).cumsum(0)
        cache_position = cache_position.clamp(0, self.max_cache_len - 1)
        to_shift = cache_position >= self.max_cache_len - 1
        indices = (slicing + to_shift[-1].int() - 1) % self.max_cache_len

        k_out = k_out[:, :, indices]
        v_out = v_out[:, :, indices]

        try:
            k_out.index_copy_(2, cache_position, key_states)
            v_out.index_copy_(2, cache_position, value_states)
        except NotImplementedError:
            # The operator 'aten::index_copy.out' is not currently implemented for the MPS device.
            k_out[:, :, cache_position] = key_states
            v_out[:, :, cache_position] = value_states

        # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment)
        self.key_cache[layer_idx].zero_()
        self.value_cache[layer_idx].zero_()

        self.key_cache[layer_idx] += k_out
        self.value_cache[layer_idx] += v_out

        return k_out, v_out

    def get_max_length(self) -> Optional[int]:
        # in theory there is no limit because the sliding window size is fixed no matter how long the sentence is
        return None

    def reset(self):
        """Zero out all layers in place (keeps static addresses intact)."""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            self.value_cache[layer_idx].zero_()
361
+
362
+
363
class HybridCache(LLMCache):
    """Static cache that mixes sliding-window layers and global-attention layers.

    Layers where ``i % 2 == 0`` get a window-capped buffer (rolled on update);
    the others get a full ``max_cache_len`` buffer written at `cache_position`.
    """

    def __init__(
        self,
        config: LLMModelConfig,
        batch_size: int,
        max_cache_len: int,
        device: torch.device,
        dtype: torch.dtype = torch.float32,
    ) -> None:
        super().__init__()
        if not hasattr(config, "sliding_window_") or config.sliding_window_ is None:
            raise ValueError(
                "Setting `cache_implementation` to 'sliding_window' requires the model config supporting "
                "sliding window attention, please check if there is a `sliding_window` field in the model "
                "config and it's not set to None."
            )
        self.max_cache_len = max_cache_len
        self.batch_size = batch_size
        self.head_dim = config.head_dim_

        self.dtype = dtype
        self.num_key_value_heads = config.n_kv_heads_
        # Even layer indices use sliding-window attention; odd ones are global.
        self.is_sliding = torch.tensor(
            [not bool(i % 2) for i in range(config.n_layers_)],
            dtype=torch.bool,
            device=device,
        )
        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        global_cache_shape = (
            self.batch_size,
            self.num_key_value_heads,
            max_cache_len,
            self.head_dim,
        )
        sliding_cache_shape = (
            self.batch_size,
            self.num_key_value_heads,
            min(config.sliding_window_, max_cache_len),
            self.head_dim,
        )
        for i in range(config.n_layers_):
            # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            # breaks when updating the cache.
            cache_shape = (
                global_cache_shape if not self.is_sliding[i] else sliding_cache_shape
            )
            new_layer_key_cache = torch.zeros(
                cache_shape, dtype=self.dtype, device=device
            )
            new_layer_value_cache = torch.zeros(
                cache_shape, dtype=self.dtype, device=device
            )
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)

    def _sliding_update(
        self,
        cache_position,
        layer_idx,
        key_states,
        value_states,
        k_out,
        v_out,
        max_cache_len,
    ):
        # Sliding-window write: roll old entries left once the window is full.
        if cache_position.shape[0] > max_cache_len:
            # Prefill longer than the window: keep only the last window's worth.
            k_out = key_states[:, :, -max_cache_len:, :]
            v_out = value_states[:, :, -max_cache_len:, :]
            # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly
            self.key_cache[layer_idx] += k_out
            self.value_cache[layer_idx] += v_out
            # we should return the whole states instead of k_out, v_out to take the whole prompt
            # into consideration when building kv cache instead of just throwing away tokens outside of the window
            return key_states, value_states

        # Left-rotation index; shift kicks in once the window is full.
        slicing = torch.ones(
            max_cache_len, dtype=torch.long, device=value_states.device
        ).cumsum(0)
        cache_position = cache_position.clamp(0, max_cache_len - 1)
        to_shift = cache_position >= max_cache_len - 1
        indices = (slicing + to_shift[-1].int() - 1) % max_cache_len
        k_out = k_out[:, :, indices]
        v_out = v_out[:, :, indices]

        k_out[:, :, cache_position] = key_states
        v_out[:, :, cache_position] = value_states
        # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment)
        self.key_cache[layer_idx].zero_()
        self.value_cache[layer_idx].zero_()

        self.key_cache[layer_idx] += k_out
        self.value_cache[layer_idx] += v_out
        return k_out, v_out

    def _static_update(
        self,
        cache_position,
        layer_idx,
        key_states,
        value_states,
        k_out,
        v_out,
        max_cache_len,
    ):
        # Global-attention write: plain positional assignment into the buffer.
        k_out[:, :, cache_position] = key_states
        v_out[:, :, cache_position] = value_states

        self.key_cache[layer_idx] = k_out
        self.value_cache[layer_idx] = v_out
        return k_out, v_out

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor]:
        """Dispatch to the sliding or static update for this layer.

        NOTE(review): the choice is driven by `cache_kwargs["sliding_window"]`
        supplied by the caller, not by `self.is_sliding` — confirm callers keep
        the two consistent. `cache_kwargs` is dereferenced unconditionally.
        """
        cache_position = cache_kwargs.get("cache_position")
        sliding_window = cache_kwargs.get("sliding_window")
        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]
        if sliding_window:
            update_fn = self._sliding_update
        else:
            update_fn = self._static_update

        return update_fn(
            cache_position,
            layer_idx,
            key_states,
            value_states,
            k_out,
            v_out,
            k_out.shape[2],  # per-layer capacity (window-capped for sliding layers)
        )

    def get_max_length(self) -> Optional[int]:
        # in theory there is no limit because the sliding window size is fixed
        # no matter how long the sentence is
        return self.max_cache_len

    def get_seq_length(self, layer_idx: Optional[int] = 0):
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        if layer_idx != 0:
            raise ValueError(
                "`get_seq_length` on `HybridCache` may get inconsistent results depending on the layer index. "
                "Using the `layer_idx` argument is not supported."
            )
        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()

    def reset(self):
        """Resets the cache values while preserving the objects"""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            self.value_cache[layer_idx].zero_()
525
+
526
+
527
# Registry mapping the user-facing `cache_implementation` string to its class;
# consumed by `cache_factory` below.
cache_dict = {
    "dynamic": DynamicCache,
    "static": StaticCache,
    "sliding_window": SlidingWindowCache,
    "hybrid": HybridCache,
}
533
+
534
+
535
def cache_factory(
    cache_implementation: str,
    config: LLMModelConfig,
    batch_size: int,
    max_cache_len: int,
):
    """Instantiate the cache class registered under `cache_implementation`.

    Device and dtype come from `config`; for "sliding_window" the requested
    length is clamped to the model's window size first.
    """
    assert (
        cache_implementation in cache_dict
    ), f"Unknown cache type. {cache_implementation}"
    logging.info(f"Use {cache_implementation} as cache implementation.")
    if cache_implementation == "sliding_window":
        assert hasattr(config, "sliding_window_")
        max_cache_len = min(config.sliding_window_, max_cache_len)
    cache_cls = cache_dict[cache_implementation]
    return cache_cls(
        config=config,
        batch_size=batch_size,
        max_cache_len=max_cache_len,
        device=config.device_,
        dtype=config.dtype_,
    )
c2cite/common/checkpoint.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Tuple
2
+
3
+ import torch
4
+
5
+
6
def pack_hook(to_offload: torch.Tensor) -> Tuple[torch.device, torch.Tensor]:
    """Saved-tensors pack hook: move the tensor to CPU, remembering its device."""
    original_device = to_offload.device
    return original_device, to_offload.to("cpu")
8
+
9
+
10
def unpack_hook(to_offload_info: Tuple[torch.device, torch.Tensor]) -> torch.Tensor:
    """Inverse of `pack_hook`: restore the offloaded tensor to its original device."""
    target_device, offloaded = to_offload_info
    return offloaded.to(target_device)
13
+
14
+
15
def CheckpointNoneFunction(run_function: Callable, *args):
    """No-op checkpoint strategy: invoke the wrapped function directly."""
    return run_function(*args)
17
+
18
+
19
def CheckpointOffloadFunction(run_function: Callable, *args):
    """Checkpoint strategy that offloads saved activations to CPU via
    `torch.autograd.graph.saved_tensors_hooks` (pack_hook / unpack_hook)."""
    with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
        return run_function(*args)
23
+
24
+
25
def CheckpointRecomputeFunction(run_function: Callable, *args):
    """Checkpoint strategy that discards activations and recomputes them in the
    backward pass (reentrant variant of torch gradient checkpointing)."""
    return torch.utils.checkpoint.checkpoint(run_function, *args, use_reentrant=True)
27
+
28
+
29
# Maps the `gradient_checkpoint_` config string (see LLMModelInput) to the
# strategy callable used to run a layer's forward pass.
CHECKPOINT_CLASSES = {
    "none": CheckpointNoneFunction,
    "offload": CheckpointOffloadFunction,
    "recompute": CheckpointRecomputeFunction,
}
c2cite/common/config.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from typing import Callable, Dict, List, Optional, TypeAlias, Union
5
+
6
+ import torch
7
+
8
+ Tokens: TypeAlias = List[int]
9
+ Labels: TypeAlias = List[int]
10
+ Masks: TypeAlias = List[bool]
11
+ Ground: TypeAlias = List[str]
12
+ Citations: TypeAlias = List[str]
13
+ Query: TypeAlias = List[str]
14
+
15
+
16
@dataclass
class Prompt:
    """Alpaca-style prompt triple; any field may be left as None."""

    instruction: str = None
    input: str = None
    label: str = None
21
+
22
+
23
@dataclass
class InputData:
    """One training/evaluation sample, with optional citation side data.

    All fields default to None; which subset is populated depends on the task
    pipeline that produced the sample — TODO(review) confirm per-task usage.
    """

    # Raw input: a Prompt, a list of strings, or a single string.
    inputs: List[Union[Prompt, List[str], str]] = None
    # Number of prefix tokens (e.g. prompt part excluded from the loss) — presumably.
    prefix_length_: int = None
    tokens: Optional[Tokens] = None
    labels: Optional[Labels] = None
    grounds: Optional[Ground] = None
    citations: Optional[Citations] = None
    citation_tokens: Optional[List] = None
    citation_embeds: Optional[List] = None
    query: Optional[Query] = None
    token_len: Optional[int] = None
    prompt: Optional[str] = None
    prompt_len: Optional[int] = None
    test_citations: Optional[Citations] = None
38
+
39
+
40
@dataclass
class LLMModelConfig:
    """Normalized architecture hyper-parameters shared by all supported models.

    Field names carry a trailing underscore by project convention.
    """

    name_or_path_: str = None
    device_: str = None
    # Hidden size.
    dim_: int = None
    head_dim_: int = None
    # MLP intermediate size.
    intermediate_: int = None
    n_heads_: int = None
    # Number of key/value heads (== n_heads_ unless GQA/MQA).
    n_kv_heads_: int = None
    n_layers_: int = None
    hidden_act_: str = None
    hidden_dropout_: float = None
    vocab_size_: int = None
    pad_token_id_: int = None
    rope_theta_: float = None
    partial_rotary_factor_: float = None
    max_seq_len_: int = None
    # eager or flash_attn
    attn_implementation_: str = "eager"
    # data type
    dtype_: torch.dtype = None
+ dtype_: torch.dtype = None
61
+
62
+
63
@dataclass
class LLMModelOutput:
    """Per-adapter output slice of a batched forward pass."""

    adapter_name: str = None
    logits: torch.Tensor = None
    router_logits: torch.Tensor = None
    loss: torch.Tensor = None
    # Whether this output carries citation supervision — TODO(review) confirm semantics.
    cite_flag: bool = False
    # Auxiliary (e.g. router load-balancing) loss, if any.
    aux_loss: torch.Tensor = None
    # for internal use: slice of the batch this adapter owns, and its loss function
    batch_start_idx_: int = -1
    batch_end_idx_: int = -1
    loss_fn_: Callable = None
+ loss_fn_: Callable = None
75
+
76
+
77
@dataclass
class LLMBatchConfig:
    """Maps one adapter to its half-open slice [start, end) of the batch."""

    adapter_name_: str = ""
    batch_start_idx_: int = -1
    batch_end_idx_: int = -1
+ batch_end_idx_: int = -1
82
+
83
+
84
+ def _efficient_operator_factory():
85
+ efficient_operator = os.getenv("MOE_PEFT_EVALUATE_MODE") is None
86
+ return efficient_operator
87
+
88
+
89
@dataclass
class LLMModelInput:
    """Everything the model forward needs for one multi-adapter batch."""

    # One entry per adapter, defining its slice of the batch tensors below.
    batch_configs_: List[LLMBatchConfig] = None
    batch_tokens_: List[Tokens] = None
    batch_labels_: List[Labels] = None
    batch_grounds_: List[Ground] = None
    batch_cites: List[List] = None
    batch_cites_value: List[List] = None
    batch_masks_: List[Masks] = None
    batch_docs: List[str] = None
    batch_prompt_len: List[int] = None

    output_router_logits_: bool = True

    # One of "none" / "offload" / "recompute" (see common/checkpoint.py).
    gradient_checkpoint_: str = "none"
    efficient_operator_: bool = field(default_factory=_efficient_operator_factory)
    inference_mode_: bool = False
+ inference_mode_: bool = False
106
+
107
+
108
@dataclass
class AdapterConfig:
    """Base configuration shared by every adapter type."""

    # Human-readable adapter identifier.
    adapter_name: str = ""
    # Downstream task tag; defaults to the causal-LM task.
    task_name: str = "casual"

    @staticmethod
    def from_config(config: Dict[str, any]) -> "AdapterConfig":
        """Build an AdapterConfig from a raw JSON-style config dict."""
        name = config.get("name", None)
        task = config.get("task_name", None)
        return AdapterConfig(adapter_name=name, task_name=task)
+
120
+
121
# All projection-module names LoRA can attach to, across the supported
# architectures. Values mark whether the module is targeted; everything starts
# off and is switched on via `LoraConfig.from_config`.
# BUGFIX: the original literal repeated several keys (q_proj, k_proj, v_proj,
# o_proj, dense, qkv_proj, down_proj) across architecture sections; duplicate
# dict keys are silently overwritten, so they are listed once here. The
# resulting mapping is identical.
lora_target_modules = {
    # LLaMA names
    "q_proj": False,
    "k_proj": False,
    "v_proj": False,
    "o_proj": False,
    "gate_proj": False,
    "down_proj": False,
    "up_proj": False,
    # Phi names (q/k/v shared with LLaMA above)
    "dense": False,
    "fc1": False,
    "fc2": False,
    # Phi3 names (o_proj/down_proj shared above)
    "qkv_proj": False,
    "gate_up_proj": False,
    # GLM names (qkv_proj/dense shared above)
    "dense_h_to_4h": False,
    "dense_4h_to_h": False,
}
+ }
148
+
149
+
150
@dataclass
class LoraConfig(AdapterConfig):
    """Configuration for one LoRA adapter (with optional DoRA/rsLoRA variants)."""

    # Weight-Decomposed Low-Rank Adaptation
    use_dora_: bool = False
    # Rank-Stabilized LoRA
    # sets the adapter scaling factor to `alpha/math.sqrt(r)`
    use_rslora_: bool = False
    # can be original or gaussian
    lora_init_: str = "original"
    lora_r_: int = None
    lora_alpha_: int = None
    lora_dropout_: float = None
    # module-name -> enabled flag; seeded from `lora_target_modules`
    target_modules_: Dict[str, bool] = None
    # Loss-weighting coefficients; exact semantics defined by the training
    # loop — TODO(review) confirm against the trainer.
    atten_coin: float = None
    router_coin: float = None
    cite_coin: float = None
    learning_rate: float = None

    def check(self) -> "LoraConfig":
        """Validate field types and ranges; returns self for chaining.

        NOTE: validation uses `assert`, so it is skipped under `python -O`.
        """
        assert isinstance(self.use_dora_, bool)
        assert isinstance(self.use_rslora_, bool)
        assert isinstance(self.lora_init_, str) and self.lora_init_ in [
            "original",
            "gaussian",
        ]
        assert isinstance(self.lora_r_, int) and self.lora_r_ > 0
        assert isinstance(self.lora_alpha_, int) and self.lora_alpha_ > 0
        assert isinstance(self.lora_dropout_, float) and self.lora_dropout_ >= 0
        assert isinstance(self.target_modules_, Dict)
        for key, value in self.target_modules_.items():
            assert isinstance(key, str) and len(key) > 0
            assert isinstance(value, bool)

        return self

    @staticmethod
    def from_config(config: Dict[str, any]) -> "LoraConfig":
        """Build a LoraConfig from a raw config dict.

        `r`, `lora_alpha`, `lora_dropout`, `atten_mat_coin`, `router_coin`,
        `cite_coin`, `lr`, and `target_modules` are required keys (KeyError
        otherwise); the rest have defaults.
        """
        lora_config = LoraConfig(**AdapterConfig.from_config(config).__dict__)
        lora_config.use_dora_ = config.get("use_dora", False)
        lora_config.use_rslora_ = config.get("use_rslora", False)
        lora_config.lora_init_ = config.get("lora_init", "original")
        lora_config.lora_r_ = config["r"]
        lora_config.lora_alpha_ = config["lora_alpha"]
        lora_config.lora_dropout_ = config["lora_dropout"]
        # Start from the full module registry (all disabled), then enable
        # whatever the config names.
        lora_config.target_modules_ = copy.deepcopy(lora_target_modules)
        lora_config.atten_coin = config["atten_mat_coin"]
        lora_config.router_coin = config["router_coin"]
        lora_config.cite_coin = config["cite_coin"]
        lora_config.learning_rate = config["lr"]
        if isinstance(config["target_modules"], List):
            for target in config["target_modules"]:
                if target in lora_target_modules:
                    lora_config.target_modules_[target] = True
        elif isinstance(config["target_modules"], Dict):
            for target, value in config["target_modules"].items():
                if target in lora_target_modules:
                    lora_config.target_modules_[target] = value
        else:
            raise ValueError("broken config item: target_modules")

        return lora_config

    def export(self) -> Dict[str, any]:
        """Serialize to a PEFT-style adapter_config dict (inverse of `from_config`)."""
        config = {}
        if self.use_dora_:
            config["use_dora"] = True
        if self.use_rslora_:
            config["use_rslora"] = True
        config["bias"] = "none"
        config["peft_type"] = "LORA"
        config["r"] = self.lora_r_
        config["lora_alpha"] = self.lora_alpha_
        config["lora_dropout"] = self.lora_dropout_
        # Only enabled modules are exported, as a flat list.
        tgt_list = []
        for target, value in self.target_modules_.items():
            if value:
                tgt_list.append(target)
        config["target_modules"] = tgt_list

        config["atten_mat_coin"] = self.atten_coin
        config["router_coin"] = self.router_coin
        config["cite_coin"] = self.cite_coin
        config["lr"] = self.learning_rate

        return config
+ return config
c2cite/common/feed_forward.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Tuple
2
+
3
+ import torch
4
+
5
+ from moe_peft.executors import executor
6
+
7
+ from .abstracts import LLMFeedForward, LLMMoeBlock
8
+ from .config import LLMModelInput
9
+ from .lora_linear import Linear, get_range_tensor
10
+
11
+
12
class FeedForward(torch.nn.Module):
    """Wraps a model's MLP and optionally routes batch slices through MoE blocks.

    With no MoE adapters attached (`moes_` empty) this is a thin pass-through
    to the wrapped `LLMFeedForward`; otherwise each adapter's slice of the
    batch is dispatched to its MoE block, falling back to the plain
    (LoRA-aware) MLP path for adapters without one.
    """

    def __init__(self, mlp: LLMFeedForward) -> None:
        super().__init__()
        self.mlp_: LLMFeedForward = mlp
        # mix of experts, keyed by adapter name
        self.moes_: Dict[str, LLMMoeBlock] = {}

    def state_dict(self) -> Dict[str, Linear]:
        # Delegates to the wrapped MLP; adapter weights live there.
        return self.mlp_.state_dict()

    def forward(
        self, data: torch.Tensor, input_args: LLMModelInput
    ) -> Tuple[torch.Tensor, List]:
        if len(self.moes_) == 0:
            return self.mlp_._batch_forward(data, input_args)
        else:
            return self._moe_forward(data, input_args)

    def _moe_forward(self, data: torch.Tensor, input_args: LLMModelInput):
        # Output buffer; each adapter's slice is written back via index_copy.
        final_hidden_states = executor.init_tensor(data)

        if input_args.output_router_logits_:
            # One slot per batch config so router logits align with config index.
            router_logits = [None for _ in range(len(input_args.batch_configs_))]
        else:
            router_logits = []

        lora_range = get_range_tensor(data.device, data.shape[0])
        for idx, lora_config in enumerate(input_args.batch_configs_):
            moe_name = lora_config.adapter_name_
            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_

            if moe_name in self.moes_:
                current_hidden_states, current_router_outputs = self.moes_[
                    moe_name
                ].forward(
                    hidden_states=data[start_idx:end_idx],
                    ffn_layer=self.mlp_,
                    input_args=input_args,
                )

                if (
                    input_args.output_router_logits_
                    and current_router_outputs is not None
                ):
                    router_logits[idx] = current_router_outputs
            else:
                # No MoE block for this adapter: plain LoRA-aware MLP path.
                current_hidden_states = self.mlp_._lora_forward(
                    moe_name, self.mlp_.act_, data[start_idx:end_idx]
                )

            executor.index_copy(
                final_hidden_states,
                0,
                lora_range[start_idx:end_idx],
                current_hidden_states,
            )

        return final_hidden_states, router_logits
c2cite/common/lora_linear.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from transformers.utils import is_bitsandbytes_available
7
+
8
+ from moe_peft.executors import executor
9
+
10
+ from .abstracts import LLMMoeBlock
11
+ from .config import LLMModelInput, LoraConfig
12
+
13
+ if is_bitsandbytes_available():
14
+ import bitsandbytes as bnb
15
+ from bitsandbytes.nn import Linear4bit, Linear8bitLt
16
+ else:
17
+ from moe_peft.utils import Linear8bitLt, Linear4bit
18
+
19
+ from typing import Any, Dict, List, Tuple
20
+
21
+
22
def dequantize_bnb_weight(weight: torch.nn.Parameter, state=None):
    """Dequantize a bitsandbytes 4-bit/8-bit parameter back to a dense tensor.

    NOTE(review): for the 8-bit path *state* must be the module's bnb
    quantization state (it is dereferenced unconditionally) — confirm callers
    always pass it for Int8 weights.
    """
    # BNB requires CUDA weights
    device = weight.device
    is_cpu = device.type == torch.device("cpu").type
    if is_cpu:
        weight = weight.to(torch.device("cuda"))

    cls_name = weight.__class__.__name__
    if cls_name == "Params4bit":
        # 4-bit path: bnb exposes a direct dequantization kernel.
        dequantized = bnb.functional.dequantize_4bit(weight.data, weight.quant_state)
        if is_cpu:
            dequantized = dequantized.to(device)
        return dequantized

    if state.SCB is None:
        state.SCB = weight.SCB

    # 8-bit path: recover the dense weight by multiplying an identity matrix
    # through bnb's int8 matmul pipeline, then dequantizing the result.
    im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device)
    im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im)
    im, Sim = bnb.functional.transform(im, "col32")
    if state.CxB is None:
        state.CxB, state.SB = bnb.functional.transform(
            weight.data, to_order=state.formatB
        )
    out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB)
    dequantized = bnb.functional.mm_dequant(
        out32, Sout32, SCim, state.SCB, bias=None
    ).t()
    if is_cpu:
        dequantized = dequantized.to(device)
    return dequantized
53
+
54
+
55
def dequantize_module_weight(module: torch.nn.Module) -> torch.nn.Parameter:
    """Return the dense weight of *module*, dequantizing HQQ or bnb weights.

    Plain ``nn.Parameter`` weights are returned as-is; quantized weights are
    expanded to a dense tensor (see :func:`dequantize_bnb_weight`).
    """
    if hasattr(module, "W_q"):  # For handling HQQ quantized weight
        weight = module.dequantize()
        return weight

    weight = module.weight
    if not isinstance(weight, torch.nn.Parameter):
        raise TypeError(
            f"Input weight should be of type nn.Parameter, got {type(weight)} instead"
        )

    cls_name = weight.__class__.__name__
    if cls_name not in ("Params4bit", "Int8Params"):
        # Already a plain dense parameter — nothing to do.
        return weight

    quant_state = getattr(module, "state", None)
    device = weight.device
    is_cpu = device.type == torch.device("cpu").type
    weight = dequantize_bnb_weight(weight, state=quant_state)  # no-op if not bnb
    if is_cpu:
        # dequantize_bnb_weight for 8bit moves the device in-place, thus we need to move it back to CPU if necessary
        module.weight = module.weight.to(device)
    return weight
78
+
79
+
80
# Per-device cache of arange tensors used for index_copy/index_add scatter ops.
g_cached_range_tensor: Dict[torch.device, torch.Tensor] = {}
# also max batch size
g_max_range = 128


def get_range_tensor(device: torch.device, batch_size: int = 1024):
    """Return a cached ``torch.arange`` tensor on *device* with at least
    *batch_size* elements.

    Fix: the original only rebuilt a device's tensor when *batch_size*
    exceeded the global ``g_max_range``. After one device grew the global
    bound, other devices kept their stale, smaller cached tensors and could
    be returned with fewer than *batch_size* entries. We now check the size
    of the cached tensor for this specific device.
    """
    global g_cached_range_tensor
    global g_max_range
    cached = g_cached_range_tensor.get(device)
    if cached is None or cached.shape[0] < batch_size:
        # Grow the shared upper bound monotonically, then (re)build this
        # device's tensor to the new bound.
        g_max_range = g_max_range if g_max_range > batch_size else batch_size
        g_cached_range_tensor[device] = torch.arange(
            0, g_max_range, step=1, device=device
        )
    return g_cached_range_tensor[device]
94
+
95
+
96
class LoraFunction(torch.autograd.Function):
    """Fused multi-adapter LoRA autograd function.

    ``*args`` is a flat sequence of ``(lora_a, lora_b)`` weight pairs, one
    pair per entry in ``input_args.batch_configs_``; a ``(None, None)`` pair
    means "no adapter for this slice". The forward accumulates each
    adapter's delta into ``result`` in place via ``index_add_``.
    """

    @staticmethod
    def forward(
        ctx,
        result: torch.Tensor,
        data: torch.Tensor,
        input_args: LLMModelInput,
        dropouts: List[float],
        scalings: List[float],
        *args,
    ):
        # the lora module is f32 precision
        data = data.to(torch.float32)

        # Saved-tensor layout: (data, then per-adapter triples
        # (lora_a, lora_b, drop_data) — None triples for skipped adapters).
        save_inputs: Tuple[torch.Tensor | None, ...] = (data,)

        lora_range = get_range_tensor(data.device, data.shape[0])
        for lora_a, lora_b, lora_config, dropout, scaling in zip(
            args[::2],
            args[1::2],
            input_args.batch_configs_,
            dropouts,
            scalings,
        ):
            # A and B must be present (or absent) together.
            assert not ((lora_a is None) ^ (lora_b is None))
            if lora_a is None and lora_b is None:
                save_inputs += (None, None, None)
                continue

            assert not ((lora_a.requires_grad) ^ (lora_b.requires_grad))
            if not lora_a.requires_grad and not lora_b.requires_grad:
                save_inputs += (None, None, None)
                continue

            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_

            # must ensure the dropout is not zero
            # is dropout == 0, dropdata is a data's referece, so the data will be changed
            assert dropout > 0.0

            # Apply dropout + scaling, then the low-rank A/B projections.
            drop_data = F.dropout(data[start_idx:end_idx], p=dropout)
            drop_data.mul_(scaling)
            drop_data = drop_data @ lora_a.transpose(0, 1)
            lora_data = drop_data @ lora_b.transpose(0, 1)

            lora_data = lora_data.to(result.dtype)

            # Accumulate this adapter's delta into its batch slice in place.
            result.index_add_(0, lora_range[start_idx:end_idx], lora_data)

            save_inputs += (lora_a, lora_b, drop_data)

        ctx.input_args = input_args
        ctx.dropouts = dropouts
        ctx.scalings = scalings
        ctx.save_for_backward(*save_inputs)

        return result

    @staticmethod
    def backward(ctx: Any, *grad_outputs: Any) -> Any:
        """Gradient w.r.t. (result, data, -, -, -, lora_a_0, lora_b_0, ...)."""
        grad_output: torch.Tensor = grad_outputs[0]
        grad_result = None
        grad_data: torch.Tensor | None = None
        grad_input_args = None
        grad_dropouts = None
        grad_scalings = None
        grad_loras: Tuple[torch.Tensor | None, ...] = ()

        data, *loras = ctx.saved_tensors

        if ctx.needs_input_grad[0]:
            # result enters additively, so its gradient passes straight through.
            grad_result = grad_output
        if ctx.needs_input_grad[1]:
            grad_data = executor.init_tensor(data)

        # the lora module is fp32 precision
        grad_output = grad_output.to(torch.float32)
        lora_range = get_range_tensor(
            grad_output.device, batch_size=grad_output.shape[0]
        )
        for lora_a, lora_b, drop_data, dropout, scaling, lora_config in zip(
            loras[::3],
            loras[1::3],
            loras[2::3],
            ctx.dropouts,
            ctx.scalings,
            ctx.input_args.batch_configs_,
        ):
            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_
            assert not ((lora_a is None) ^ (lora_b is None))
            if lora_a is None and lora_b is None:
                grad_loras += (None, None)
                if grad_data is not None:
                    # No adapter for this slice -> zero input gradient here.
                    executor.index_fill(grad_data, 0, lora_range[start_idx:end_idx], 0)
                continue

            # lora_data shape is batch_size * seq_len * in_dim
            lora_data = data[start_idx:end_idx]
            # grad_y shape is batch_size * seq_len * out_dim
            grad_y = grad_output[start_idx:end_idx]

            # drop_data shape is batch_size * seq_len * r

            # bstage shape is batch_size * seq_len * r
            bstage = grad_y @ lora_b
            # 1/(1-dropout) compensates for the dropout scaling at train time.
            bstage *= scaling / (1 - dropout)

            grad_a = torch.sum(bstage.transpose(1, 2) @ lora_data, dim=0)
            grad_b = torch.sum(grad_y.transpose(1, 2) @ drop_data, dim=0)
            grad_loras += (grad_a, grad_b)

            # grad_data shape is batch_size * seq_len * in_dim
            if grad_data is not None:
                grad_x = bstage @ lora_a
                executor.index_copy(grad_data, 0, lora_range[start_idx:end_idx], grad_x)

        return (
            grad_result,
            grad_data,
            grad_input_args,
            grad_dropouts,
            grad_scalings,
            *grad_loras,
        )
222
+
223
+
224
class Lora(nn.Module):
    """A single LoRA adapter (fp32 A/B pair plus optional DoRA magnitude)
    attached to one base linear layer."""

    def __init__(
        self,
        base_layer: nn.Module,
        shape: Tuple[int, int],
        config: LoraConfig,
        device: str,
    ):

        super().__init__()

        self.base_layer_ = base_layer
        self.device_ = torch.device(device)

        self.initializer_ = config.lora_init_
        self.r_ = config.lora_r_
        self.alpha_ = config.lora_alpha_

        # rsLoRA scales by alpha/sqrt(r) rather than the classic alpha/r.
        if config.use_rslora_:
            self.scaling_ = self.alpha_ / math.sqrt(self.r_)
        else:
            self.scaling_ = self.alpha_ / self.r_

        self.in_features_, self.out_features_ = shape

        # Strictly positive dropout is required by the fused LoraFunction path.
        assert config.lora_dropout_ > 0.0
        self.dropout_ = nn.Dropout(p=config.lora_dropout_)

        # The low-rank projections are kept in fp32 regardless of base dtype.
        self.lora_a_ = nn.Linear(
            self.in_features_,
            self.r_,
            bias=False,
            dtype=torch.float32,
            device=self.device_,
        )
        self.lora_b_ = nn.Linear(
            self.r_,
            self.out_features_,
            bias=False,
            dtype=torch.float32,
            device=self.device_,
        )

        self.use_dora_: bool = config.use_dora_
        # Populated in reset_parameters() when DoRA is enabled.
        self.magnitude_vector_: nn.Parameter = None

    def _get_weight_norm(self, dtype: torch.dtype = torch.float32) -> torch.Tensor:
        # calculate L2 norm of weight matrix, column-wise
        weight = dequantize_module_weight(self.base_layer_).to(dtype)
        lora_weight = self.lora_b_.weight @ self.lora_a_.weight
        weight = weight + self.scaling_ * lora_weight
        weight_norm = torch.linalg.norm(weight, dim=1).to(weight.dtype)
        return weight_norm

    def reset_parameters(self, lora_tensor=(None, None)) -> None:
        """Initialize A/B weights (and the DoRA magnitude, if enabled),
        either from scratch or from a provided pair of tensors."""
        # if the lora_tensor is not (None, None), use it to init the lora weight
        assert isinstance(lora_tensor, Tuple)
        assert len(lora_tensor) == 2
        assert ((lora_tensor[0] is None) and (lora_tensor[1] is None)) or (
            isinstance(lora_tensor[0], torch.Tensor)
            and isinstance(lora_tensor[1], torch.Tensor)
        )

        if lora_tensor == (None, None):
            if self.initializer_ == "original":
                nn.init.kaiming_uniform_(self.lora_a_.weight, a=math.sqrt(5))
            elif self.initializer_ == "gaussian":
                nn.init.normal_(self.lora_a_.weight, std=1 / self.r_)
            else:
                raise ValueError(f"Unknown initialization {self.initializer_}")
            # B starts at zero so the adapter is initially an identity delta.
            nn.init.zeros_(self.lora_b_.weight)
        else:
            with torch.no_grad():
                self.lora_a_.weight.copy_(lora_tensor[0])
                self.lora_b_.weight.copy_(lora_tensor[1])

        if self.use_dora_:
            self.magnitude_vector_ = nn.Parameter(
                self._get_weight_norm(), requires_grad=True
            )

    def apply_dora(
        self,
        residual: torch.Tensor,
        result_lora: torch.Tensor,
    ):
        """Rescale base + LoRA outputs by the learned magnitude / weight norm."""
        # The norm is detached: DoRA treats it as a constant during backprop.
        weight_norm = self._get_weight_norm().detach()
        mag_norm_scale = (self.magnitude_vector_ / weight_norm).view(1, -1)
        return mag_norm_scale * residual + mag_norm_scale * result_lora

    def lora_forward(self, hidden_states: torch.Tensor):
        # dropout -> A -> B, scaled; computed in fp32.
        return (
            self.lora_b_(self.lora_a_(self.dropout_(hidden_states.to(torch.float32))))
            * self.scaling_
        )

    def forward(
        self,
        residual: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Add this adapter's delta to *residual* (DoRA-aware)."""
        result_lora = self.lora_forward(hidden_states)
        if self.use_dora_:
            return self.apply_dora(residual, result_lora).to(residual.dtype)
        else:
            return residual + result_lora.to(residual.dtype)
330
+
331
+
332
class Linear(nn.Module):
    """A frozen base linear layer (dense or bnb-quantized) carrying multiple
    named LoRA adapters and/or MoE blocks, dispatched per batch slice."""

    def __init__(self, base_layer: nn.Module, device: str):
        super().__init__()

        if not isinstance(base_layer, nn.Linear):
            # Only bnb-quantized linears are accepted besides nn.Linear.
            assert isinstance(base_layer, Linear8bitLt) or isinstance(
                base_layer, Linear4bit
            ), f"error type - {type(base_layer)}."
        else:
            # The base weight is frozen; only adapters are trainable.
            base_layer.requires_grad_(False)

        self.device_ = torch.device(device)
        self.base_layer_ = base_layer.to(self.device_)
        self.loras_: Dict[str, Lora] = {}
        self.moes_: Dict[str, LLMMoeBlock] = {}

        if isinstance(self.base_layer_, Linear4bit):
            # Linear4bit packs its weight, so read the declared features.
            self.out_features_, self.in_features_ = (
                self.base_layer_.out_features,
                self.base_layer_.in_features,
            )
        else:
            self.out_features_, self.in_features_ = self.base_layer_.weight.shape

    def init_lora_weight(
        self, lora_config: LoraConfig, lora_tensor=(None, None), adapter_name=None
    ):
        """Create (if absent) and (re)initialize the adapter's LoRA weights."""
        if adapter_name is None:
            # NOTE(review): reads `adapter_name` (no trailing underscore) off
            # LoraConfig while batch configs use `adapter_name_` — confirm
            # LoraConfig really exposes this attribute.
            adapter_name = lora_config.adapter_name

        if adapter_name not in self.loras_:
            self.loras_[adapter_name] = Lora(
                self.base_layer_,
                (self.in_features_, self.out_features_),
                lora_config,
                self.device_,
            )

        self.loras_[adapter_name].reset_parameters(lora_tensor)

    # NOTE(review): name keeps the original spelling ("appy") — internal
    # callers in this class use it.
    def _appy_dora(
        self,
        residual: torch.Tensor,
        lora_delta: torch.Tensor,
        input_args: LLMModelInput,
    ):
        """Combine residual and LoRA deltas slice-by-slice, applying DoRA
        rescaling for adapters that enable it."""
        next_states = executor.init_tensor(residual)
        lora_range = get_range_tensor(
            next_states.device, batch_size=next_states.shape[0]
        )
        for lora_config in input_args.batch_configs_:
            adapter_name = lora_config.adapter_name_
            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_

            # Slices without a known adapter are left zeroed in next_states.
            if adapter_name == "" or adapter_name not in self.loras_:
                continue

            if self.loras_[adapter_name].use_dora_:
                lora_data = self.loras_[adapter_name].apply_dora(
                    residual[start_idx:end_idx],
                    lora_delta[start_idx:end_idx],
                )
            else:
                lora_data = residual[start_idx:end_idx] + lora_delta[start_idx:end_idx]

            executor.index_copy(
                next_states, 0, lora_range[start_idx:end_idx], lora_data
            )

        return next_states

    def _efficient_impl(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Fused path: all adapters applied in one LoraFunction call."""
        # hidden_states shape is: batch_size * max_seq_len * dim
        # result = hidden_states @ self.weight_.transpose(0, 1)
        residual = self.base_layer_.forward(hidden_states)

        if len(self.loras_) == 0:
            return residual

        # split the data and result
        dropouts: List[float] = []
        scalings: List[float] = []
        loras: Tuple[torch.Tensor] = ()
        for lora_config in input_args.batch_configs_:
            adapter_name = lora_config.adapter_name_

            if adapter_name not in self.loras_:
                # Placeholder triple keeps positions aligned with batch configs.
                loras += (None, None)
                dropouts.append(None)
                scalings.append(None)
                continue

            loras += (
                self.loras_[adapter_name].lora_a_.weight,
                self.loras_[adapter_name].lora_b_.weight,
            )
            dropouts.append(self.loras_[adapter_name].dropout_.p)
            scalings.append(self.loras_[adapter_name].scaling_)

        have_dora = any(lora.use_dora_ for lora in self.loras_.values())

        if have_dora:
            # DoRA needs the raw LoRA delta separately, so accumulate into a
            # zero tensor instead of directly into the residual.
            lora_delta = torch.zeros_like(residual, dtype=torch.float32)
            lora_delta = LoraFunction.apply(
                lora_delta,
                hidden_states.to(torch.float32),
                input_args,
                dropouts,
                scalings,
                *loras,
            )
            next_states = self._appy_dora(
                residual.to(torch.float32), lora_delta, input_args
            )
        else:
            next_states = LoraFunction.apply(
                residual.to(torch.float32),
                hidden_states.to(torch.float32),
                input_args,
                dropouts,
                scalings,
                *loras,
            )

        return next_states.to(hidden_states.dtype)

    def _compatible_impl(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Fallback path: per-slice dispatch through Lora or MoE modules."""
        # hidden_states shape is: batch_size * max_seq_len * dim
        # result = hidden_states @ self.weight_.transpose(0, 1)
        residual = self.base_layer_.forward(hidden_states)

        if len(self.loras_) == 0:
            return residual

        next_states = executor.init_tensor(residual)
        lora_range = get_range_tensor(hidden_states.device, hidden_states.shape[0])

        for lora_config in input_args.batch_configs_:
            adapter_name = lora_config.adapter_name_
            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_

            if adapter_name in self.loras_:
                fwd_fn = self.loras_[adapter_name].forward
                kwargs = {}
            elif adapter_name in self.moes_:
                fwd_fn = self.moes_[adapter_name].forward
                kwargs = {"lora_linear": self}
            else:
                # Unknown adapter: pass the base-layer output through as-is.
                executor.index_copy(
                    next_states,
                    0,
                    lora_range[start_idx:end_idx],
                    residual[start_idx:end_idx],
                )
                continue

            lora_data = fwd_fn(
                residual=residual[start_idx:end_idx],
                hidden_states=hidden_states[start_idx:end_idx],
                **kwargs,
            )
            executor.index_copy(
                next_states, 0, lora_range[start_idx:end_idx], lora_data
            )

        return next_states

    def forward(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Choose the fused path when allowed and no MoE blocks are attached."""
        if input_args.efficient_operator_ and len(self.moes_) == 0:
            return self._efficient_impl(hidden_states, input_args)
        else:
            return self._compatible_impl(hidden_states, input_args)
c2cite/common/moe_utils.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import List, Optional
3
+
4
+ import torch
5
+
6
+ from .abstracts import LLMDecoder, LLMModelInput
7
+
8
+
9
def slice_tensor(
    data: torch.Tensor,
    slice: torch.Tensor,
    dtype: torch.dtype,
    last_value: Optional[torch.Tensor] = None,
):
    """Gather the rows of *data* selected by the index tensor *slice* and
    return them as a 2-D tensor of the requested *dtype*.

    If *last_value* is provided, it is returned unchanged (memoized result).
    """
    if last_value is not None:
        return last_value
    # for macOS debugging, please uncomment this line
    # assert data.dtype in (torch.float, torch.int, torch.bool)
    gathered = data[None, slice]
    return gathered.reshape(-1, data.shape[-1]).to(dtype)
21
+
22
+
23
def unpack_router_logits(gate_logits: List[torch.Tensor]) -> torch.Tensor:
    """Concatenate per-layer router logits into one tensor, moving every
    layer's logits onto the first layer's device first."""
    target_device = gate_logits[0].device
    moved = [layer_gate.to(target_device) for layer_gate in gate_logits]
    return torch.cat(moved, dim=0)
29
+
30
+
31
# NOTE(review): the name keeps the original misspelling ("logtis") because
# external callers import it by this name.
def collect_plugin_router_logtis(
    router_logits, input_args: LLMModelInput, decoder_layer: LLMDecoder
):
    """Harvest router logits stashed on plugin MoE blocks of *decoder_layer*
    for every batch config that did not already produce logits.

    Side effect: each harvested block's ``router_logits_`` is reset to None.
    """
    if router_logits is None or len(router_logits) == 0:
        router_logits = [None for _ in range(len(input_args.batch_configs_))]

    # Merge attention and MLP projections into a single name -> module map.
    attn_proj, mlp_proj = decoder_layer.state_dict()
    all_proj = copy.copy(attn_proj)
    all_proj.update(mlp_proj)
    for idx, config in enumerate(input_args.batch_configs_):
        if router_logits[idx] is not None:
            # This adapter already reported logits through the normal path.
            continue
        adapter_name = config.adapter_name_
        for proj in all_proj.values():
            if adapter_name in proj.moes_ and hasattr(
                proj.moes_[adapter_name], "router_logits_"
            ):
                if router_logits[idx] is None:
                    router_logits[idx] = []
                router_logits[idx].append(proj.moes_[adapter_name].router_logits_)
                # Clear the stash so logits are not double-counted next step.
                proj.moes_[adapter_name].router_logits_ = None

    # Collapse each adapter's per-projection list into one tensor.
    for idx, logits in enumerate(router_logits):
        if isinstance(logits, list):
            router_logits[idx] = torch.cat(logits, 0)

    return router_logits
c2cite/common/rope.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+
6
+ from .config import LLMModelConfig
7
+
8
+
9
+ def _compute_default_rope_parameters(
10
+ config: Optional[LLMModelConfig] = None,
11
+ device: Optional[torch.device] = None,
12
+ seq_len: Optional[int] = None,
13
+ **rope_kwargs,
14
+ ) -> Tuple[torch.Tensor, float]:
15
+ if len(rope_kwargs) > 0:
16
+ base = rope_kwargs["base"]
17
+ dim = rope_kwargs["dim"]
18
+ elif config is not None:
19
+ base = config.rope_theta_
20
+ partial_rotary_factor = (
21
+ config.partial_rotary_factor_
22
+ if config.partial_rotary_factor_ is not None
23
+ else 1.0
24
+ )
25
+ head_dim = (
26
+ config.dim_ // config.n_heads_
27
+ if config.head_dim_ is None
28
+ else config.head_dim_
29
+ )
30
+ dim = int(head_dim * partial_rotary_factor)
31
+
32
+ attention_factor = 1.0 # Unused in this type of RoPE
33
+
34
+ # Compute the inverse frequencies
35
+ inv_freq = 1.0 / (
36
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)
37
+ )
38
+ return inv_freq, attention_factor
39
+
40
+
41
def _compute_llama3_parameters(
    config: LLMModelConfig,
    device: torch.device,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple[torch.Tensor, float]:
    """Llama-3 frequency scaling: low frequencies are divided by ``factor``,
    high frequencies are untouched, and a smooth interpolation bridges the
    band in between (mirrors HF transformers' llama3 rope_scaling)."""
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(
        config, device, seq_len, **rope_kwargs
    )

    factor = config.rope_scaling_["factor"]  # `8` in the original implementation
    low_freq_factor = config.rope_scaling_[
        "low_freq_factor"
    ]  # `1` in the original implementation
    high_freq_factor = config.rope_scaling_[
        "high_freq_factor"
    ]  # `4` in the original implementation
    old_context_len = config.rope_scaling_[
        "original_max_position_embeddings"
    ]  # `8192` in the original implementation

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq
    # wavelen < high_freq_wavelen: do nothing
    # wavelen > low_freq_wavelen: divide by factor
    inv_freq_llama = torch.where(
        wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
    )
    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
        high_freq_factor - low_freq_factor
    )
    smoothed_inv_freq = (
        1 - smooth_factor
    ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    # Medium band = neither high- nor low-frequency region.
    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

    return inv_freq_llama, attention_factor


# Registry mapping a rope_scaling "type" name to its init function.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "llama3": _compute_llama3_parameters,
}
c2cite/dispatcher.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import random
3
+ import sys
4
+ from abc import abstractmethod
5
+ from typing import Callable, Dict, List
6
+
7
+ import datasets
8
+ import copy
9
+
10
+ from .common import InputData, LLMBatchConfig, LLMModelInput, Masks, Tokens
11
+ from .tokenizer import Tokenizer
12
+
13
+
14
class Event:
    """A tiny callback chain: callbacks registered later run earlier, and
    activation stops at the first callback that returns a truthy value."""

    __callback_list: List[Callable] = None

    def __init__(self):
        self.__callback_list = []

    def register(self, func: Callable) -> "Event":
        """Add *func* at the front of the chain; returns self for chaining."""
        self.__callback_list.insert(0, func)
        return self

    def activate(self, **kwargs) -> bool:
        """Invoke callbacks in order; True once one reports the event handled."""
        for callback in self.__callback_list:
            if callback(**kwargs):
                return True
        return False
30
+
31
+
32
def load_dataset(data_path: str):
    """Load a dataset from a local JSON/JSONL file or from the HF hub.

    Hub paths may carry a subset name encoded as ``"name:subset"``.
    """
    if data_path.endswith((".json", ".jsonl")):
        return datasets.load_dataset("json", data_files=data_path)
    if ":" in data_path:
        parts = data_path.split(":")
        return datasets.load_dataset(parts[0], parts[1])
    return datasets.load_dataset(data_path)
41
+
42
+
43
class TrainTask:
    """One adapter's training data plus epoch/micro-batch bookkeeping."""

    tokenizer_: Tokenizer = None

    adapter_name_: str = ""
    data_path_: str = ""
    dataload_function_: Callable = None
    train_token_data_: List[InputData] = None

    # train parameter
    total_epoch_num_: int = -1
    max_train_batch_size_: int = -1
    max_train_micro_batch_size_: int = -1
    max_test_batch_size_: int = -1

    train_cutoff_len_: int = -1
    group_by_length_: bool = False

    # count the stat of train and test data
    epoch_cnt_: int = 1
    next_train_data_start_idx_: int = 0
    next_test_data_start_idx_: int = 0

    # NOTE(review): the parameter is spelled "tokenzer" — kept because
    # callers (Dispatcher) pass it by keyword.
    def __init__(
        self,
        tokenzer: Tokenizer,
        adapter_name: str,
        dataload_function: Callable,
        total_epoch_num: int,
        max_train_batch_size: int,
        max_train_micro_batch_size: int,
        train_cutoff_len: int = 256,
        group_by_length: bool = True,
    ):
        self.tokenizer_ = tokenzer
        self.adapter_name_ = adapter_name
        self.dataload_function_ = dataload_function
        self.total_epoch_num_ = total_epoch_num
        self.max_train_batch_size_ = max_train_batch_size
        self.max_train_micro_batch_size_ = max_train_micro_batch_size
        self.train_cutoff_len_ = train_cutoff_len
        self.group_by_length_ = group_by_length

    def load_data(self):
        """Tokenize/load the data, truncate to the cutoff length, and order it
        (longest-first when grouping by length, otherwise shuffled)."""
        self.train_token_data_ = self.dataload_function_(self.tokenizer_)
        max_train_tokens_len = 0
        for data in self.train_token_data_:
            max_train_tokens_len = max(max_train_tokens_len, len(data.tokens))
            if len(data.tokens) > self.train_cutoff_len_:
                data.tokens = data.tokens[: self.train_cutoff_len_]

        logging.info(
            f"Max train tokens length: {max_train_tokens_len}/{self.train_cutoff_len_}"
        )
        if self.group_by_length_:
            # Sorting is required by get_train_deta_max_seq_len(), which
            # assumes the longest remaining sequence is at the cursor.
            self.train_token_data_.sort(key=lambda x: len(x.tokens), reverse=True)
        else:
            random.shuffle(self.train_token_data_)

    def is_train_done(self):
        if self.epoch_cnt_ <= self.total_epoch_num_:
            return False
        return True

    def is_test_done(self):
        # NOTE(review): self.test_token_data_ is never assigned in this class
        # — this raises AttributeError unless a subclass/caller sets it; verify.
        if self.next_test_data_start_idx_ < len(self.test_token_data_):
            return False
        return True

    def reset_test_status(self):
        self.next_test_data_start_idx_ = 0

    # reentry function
    # NOTE(review): "deta" typo kept — Dispatcher calls this exact name.
    def get_train_deta_max_seq_len(self) -> int:
        """Length of the next micro-batch's longest sequence (no cursor move)."""
        start_idx = self.next_train_data_start_idx_
        assert start_idx < len(self.train_token_data_)
        # in this strategy must sort
        return len(self.train_token_data_[start_idx].tokens)

    # non reentry function
    def get_train_data(self) -> List[InputData]:
        """Pop the next micro-batch, advancing the cursor and epoch counter."""
        start_idx = self.next_train_data_start_idx_
        end_idx = start_idx + self.max_train_micro_batch_size_

        ret_data = self.train_token_data_[start_idx:end_idx]

        logging.info(f"{self.adapter_name_} train data:")
        logging.info(
            f"    epoch: {self.epoch_cnt_}/{self.total_epoch_num_} \
              step in epoch: {start_idx}/{len(self.train_token_data_)}"
        )

        self.next_train_data_start_idx_ += self.max_train_micro_batch_size_
        if self.next_train_data_start_idx_ >= len(self.train_token_data_):
            # Wrapped around the dataset -> one epoch completed.
            self.next_train_data_start_idx_ = 0
            self.epoch_cnt_ += 1

        return ret_data
140
+
141
+
142
class DispatcherConfig:
    """Interface for objects that can supply TrainTask constructor kwargs."""

    @abstractmethod
    def dispatcher_context(self) -> Dict[str, any]:
        """Return keyword arguments used by Dispatcher to build a TrainTask."""
        return {}
146
+
147
+
148
class Dispatcher:
    """Schedules multiple adapters' TrainTasks and assembles their next
    micro-batches into a single padded ``LLMModelInput``.

    Fixes vs. the original:
    - ``raise "unkown strategy"`` raised a *string* (always a TypeError in
      Python 3); replaced with a proper ``ValueError``.
    - ``data.citation_embeds == None`` -> ``is None``.
    - the citation-length assert's message called ``print(...)`` (evaluates
      to None) and reported the wrong variable; it now reports the actual
      citation count.
    - the citation-token predicate is a static helper instead of a closure
      redefined per data item.
    """

    config_ = None
    tokenizer_: Tokenizer = None

    # all train task
    ready_train_task_: List[TrainTask] = None
    running_train_task_: List[TrainTask] = None
    done_train_task_: List[TrainTask] = None

    # train task in event
    train_task_in_event_: Event = None
    train_task_out_event_: Event = None

    # the number of max candidate training lora model
    # can chose train data from this dataset
    train_lora_candidate_num_: int = 0
    # the number of simultaneously train lora model
    train_lora_simultaneously_num_: int = 0

    strategy_: str = ""

    def __init__(
        self,
        tokenizer: Tokenizer,
        configs: List[DispatcherConfig],
        max_concurrent_jobs: int = None,
        strategy: str = "optim",
        cutoff_len: int = 256,
    ) -> None:
        """Build one TrainTask per config; tasks start in the ready queue."""
        if max_concurrent_jobs is None:
            max_concurrent_jobs = len(configs)

        self.tokenizer_ = tokenizer

        self.ready_train_task_ = []
        self.running_train_task_ = []
        self.done_train_task_ = []

        self.train_task_in_event_ = Event()
        self.train_task_out_event_ = Event()

        # No practical cap on how many tasks may be "running" at once; the
        # concurrency limit applies to how many are batched together.
        self.train_lora_candidate_num_ = sys.maxsize
        self.train_lora_simultaneously_num_ = max_concurrent_jobs
        self.strategy_ = strategy

        # create ready task
        for config_class in configs:
            kwargs = config_class.dispatcher_context()
            self.ready_train_task_.append(
                TrainTask(
                    tokenzer=self.tokenizer_, train_cutoff_len=cutoff_len, **kwargs
                )
            )

    def optim_dispatch_strategy(self) -> Dict[str, List[InputData]]:
        """Pick the window of running tasks whose next sequence lengths are
        most similar, minimizing padding in the combined batch."""
        task_len = {}
        for idx, task in enumerate(self.running_train_task_):
            task_len[idx] = task.get_train_deta_max_seq_len()
        # sort to get the seq most similar data
        task_len = sorted(task_len.items(), key=lambda x: x[1], reverse=True)
        # find the mini diff
        min_need_pad_len = sys.maxsize
        win_start_idx = 0
        for sidx in range(0, len(task_len) - self.train_lora_simultaneously_num_ + 1):
            win = task_len[sidx : sidx + self.train_lora_simultaneously_num_]
            need_pad_len = 0
            for i in range(1, len(win)):
                # align to the max seq len (win[0] is the longest in window)
                need_pad_len += abs(win[i][1] - win[0][1])
            if need_pad_len < min_need_pad_len:
                min_need_pad_len = need_pad_len
                win_start_idx = sidx
        # the result is win_start_idx
        result_win = task_len[
            win_start_idx : win_start_idx + self.train_lora_simultaneously_num_
        ]
        ret_train_data = {}
        for result_task_len in result_win:
            task_idx = result_task_len[0]
            ret_train_data[self.running_train_task_[task_idx].adapter_name_] = (
                self.running_train_task_[task_idx].get_train_data()
            )

        return ret_train_data

    def none_dispatch_strategy(self) -> Dict[str, List[InputData]]:
        """Take data from running tasks in order, up to the concurrency cap."""
        ret_train_data = {}
        cnt = 0
        for task in self.running_train_task_:
            assert not task.is_train_done()
            if cnt >= self.train_lora_simultaneously_num_:
                break
            ret_train_data[task.adapter_name_] = task.get_train_data()
            cnt += 1
        return ret_train_data

    def check_task_done(self) -> bool:
        """True when no task remains in the ready or running queues."""
        if len(self.ready_train_task_) == 0 and len(self.running_train_task_) == 0:
            return True
        return False

    def check_test_done(self) -> bool:
        for task in self.running_train_task_:
            if task.is_train_done():
                return False
        return True

    def reset_test_task(self):
        for task in self.running_train_task_:
            task.reset_test_status()

    # ready task -> running task
    def __dispatch_task_in(self):
        assert len(self.running_train_task_) <= self.train_lora_candidate_num_
        if len(self.running_train_task_) == self.train_lora_candidate_num_:
            return
        # chose task into running
        while (
            len(self.running_train_task_) < self.train_lora_candidate_num_
            and len(self.ready_train_task_) > 0
        ):
            # TODO to dispatch task
            task = self.ready_train_task_.pop(0)
            # to lazy load data
            task.load_data()
            self.train_task_in_event_.activate(task=task)
            self.running_train_task_.append(task)

    # running task -> done task
    def __dispatch_task_out(self):
        for task in self.running_train_task_:
            if task.is_train_done():
                self.train_task_out_event_.activate(task=task)
                self.done_train_task_.append(task)

        self.running_train_task_ = [
            task for task in self.running_train_task_ if not task.is_train_done()
        ]

    def get_test_data(self) -> LLMModelInput:
        pass

    @staticmethod
    def _is_cite_token(token_id: int) -> bool:
        # Llama-3 reserved special-token ids used here as citation markers.
        return (128010 <= token_id <= 128255) or token_id in {
            128004,
            128002,
            128003,
            128005,
            128008,
        }

    def get_train_data(self) -> LLMModelInput:
        """Gather the next micro-batches, pad/align them, extract citation
        marker positions, and package everything as an LLMModelInput."""
        self.__dispatch_task_in()

        # get task train data
        all_train_data: Dict[str, List[InputData]] = {}
        if self.strategy_ == "none":
            all_train_data = self.none_dispatch_strategy()
        elif self.strategy_ == "optim":
            all_train_data = self.optim_dispatch_strategy()
        else:
            # Fixed: originally raised a bare string, which is a TypeError.
            raise ValueError(f"unknown strategy: {self.strategy_}")

        batch_seq_len: int = -1
        # to align batch token data
        for adapter in all_train_data:
            for data in all_train_data[adapter]:
                batch_seq_len = max(batch_seq_len, len(data.tokens))
        # all prompts and tokens / config
        batch_tokens: List[Tokens] = []
        attention_masks: List[Masks] = []
        batch_labels: List[List] = []
        lora_batch_data_config: List[LLMBatchConfig] = []

        cites = []
        cites_value = []
        docs = []
        prompt_len = []
        # batch the all adapter data
        adapter_start_idx: int = 0
        for adapter in all_train_data:
            adapter_end_idx: int = adapter_start_idx + len(all_train_data[adapter])
            for data in all_train_data[adapter]:
                tokens: Tokens = data.tokens.copy()
                prompt_len.append(data.prompt_len)
                # Positions and values of citation-marker tokens in the sample.
                cite = [
                    index
                    for index, value in enumerate(tokens)
                    if self._is_cite_token(value)
                ]
                cite_value = [value for value in tokens if self._is_cite_token(value)]
                # Fixed: the failure message previously was print(...) (None)
                # and referenced the accumulated list instead of this sample.
                assert len(cite) < 40, f"too long!!! need:{len(cite)}"
                if len(cite) > 0:
                    # Close the final citation span at the end of the sample.
                    if cite[len(cite) - 1] != data.token_len:
                        cite.append(data.token_len)
                pad_side = self.tokenizer_.padding_side_
                assert pad_side == "right" or pad_side == "left"
                # pad the tokens to align
                while len(tokens) < batch_seq_len:
                    if pad_side == "right":
                        tokens.append(self.tokenizer_.pad_id_)
                    else:
                        tokens.insert(0, self.tokenizer_.pad_id_)
                batch_tokens.append(tokens)
                cites.append(cite.copy())
                cites_value.append(cite_value.copy())
                # Prefer precomputed citation embeddings when available.
                if data.citation_embeds is None:
                    docs.append(data.citation_tokens)
                else:
                    docs.append(data.citation_embeds)
                attention_masks.append(self.tokenizer_.mask_from(tokens))
                labels = data.labels
                if labels is None:
                    # Default to language-modeling labels (copy of the input).
                    labels = tokens.copy()
                else:
                    labels = labels.copy()
                batch_labels.append(labels)

            lora_batch_data_config.append(
                LLMBatchConfig(
                    adapter_name_=adapter,
                    batch_start_idx_=adapter_start_idx,
                    batch_end_idx_=adapter_end_idx,
                )
            )
            adapter_start_idx = adapter_end_idx

        self.__dispatch_task_out()

        return LLMModelInput(
            batch_cites=cites,
            batch_cites_value=cites_value,
            batch_docs=docs,
            batch_prompt_len=prompt_len,
            batch_configs_=lora_batch_data_config,
            batch_tokens_=batch_tokens,
            batch_labels_=batch_labels,
            batch_masks_=attention_masks,
            gradient_checkpoint_="recompute",
        )
c2cite/evaluator.py ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import time
4
+ import sys
5
+ import os
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List, Tuple, Union, Optional
8
+
9
+ import torch
10
+
11
+ from .adapters import MixLoraConfig
12
+ from .common import InputData, LLMBatchConfig, LLMModelInput, Prompt, Tokens
13
+ from .model import LLMModel
14
+ from .tasks import BasicMetric, BasicTask, CommonSenseTask, task_dict
15
+ from .tokenizer import Tokenizer
16
+ from moe_peft.prompter import Prompter
17
+ from moe_peft.generator import _batch_generate
18
+ from moe_peft.solutions import get_output
19
+
20
@dataclass
class GenerateData:
    """Per-prompt decoding state tracked during batch generation.

    NOTE(review): this class is duplicated in c2cite/generator.py — keep the
    two definitions in sync (or import one from the other).
    """

    # Name of the adapter this prompt belongs to.
    adapter_name_: str = None
    # Index of the prompt within its GenerateConfig.prompts list.
    prompt_index_: int = None
    # Token length of the prompt prefix (used later to strip the prompt
    # from the generated sequence).
    prefix_length_: int = None
    # Accumulated token ids: prompt tokens plus tokens generated so far.
    raw_tokens_: Tokens = None
26
+
27
+
28
@dataclass
class GenerateConfig:
    """Generation settings plus runtime state for one adapter.

    NOTE(review): duplicated in c2cite/generator.py — keep in sync.
    """

    adapter_name: str = None
    prompts: List[Union[str, Tuple[str, str]]] = None
    prompt_template: str = None
    # Generate Arguments
    batch_size: int = 8
    stop_token: str = None
    temperature: float = 1
    top_p: float = 0.9
    top_k: float = 50
    do_sample: bool = True
    repetition_penalty: float = 1.1
    renormalize_logits: bool = True
    # Do not set these manually
    prompter_: Prompter = None
    stop_token_: torch.Tensor = None
    data_: List[GenerateData] = None

    # Set prompt_template_ to enable the prompter
    def generate_prompt(self, instruction: str, input: str = None) -> str:
        """Render one prompt through the (lazily created) Prompter."""
        if self.prompter_ is None:
            self.prompter_ = Prompter(self.prompt_template)

        return self.prompter_.generate_prompt(instruction=instruction, input=input)

    def get_prompts(self) -> List[str]:
        """Render every configured prompt; tuples are (instruction, input)."""
        prompts = []
        for prompt in self.prompts:
            args = prompt if isinstance(prompt, Tuple) else (prompt, None)
            prompts.append(self.generate_prompt(*args))

        return prompts

    def get_response(self, output: str) -> str:
        """Extract the response part of a raw model output."""
        if self.prompter_ is None:
            return output.strip()
        else:
            return self.prompter_.get_response(output)

    def reset_parameters(self):
        """Re-initialize the runtime fields before a new generation run."""
        self.prompter_ = Prompter(self.prompt_template)
        self.stop_token_ = None
        self.data_ = []
72
+
73
+
74
@dataclass
class EvaluateConfig:
    """One (adapter, task) evaluation job and its scheduling cursors."""

    adapter_name: str = None
    task_name: str = None
    data_path: str = None
    batch_size: int = 16
    router_profile: bool = False
    # Do not set these manually
    task_: BasicTask = None
    data_: List[InputData] = None
    metric_: BasicMetric = None
    # Cursors used by the dispatcher; rollback_start_idx_ lets evaluate()
    # replay a batch after an OOM-triggered concurrency reduction.
    rollback_start_idx_: int = 0
    batch_start_idx_: int = 0
    batch_end_idx_: int = 0

    def _dataload_fn(self, tokenizer: Tokenizer, **tokenizer_kwargs):
        """Load the task data and pre-encode inputs (and citations, if any)."""
        data = self.task_.loading_data(False, self.data_path)
        for idx, data_point in enumerate(data):
            # Evaluation expects plain-text inputs, not Prompt objects.
            assert not isinstance(data_point.inputs, Prompt)

            data_point.tokens = tokenizer.encode(data_point.inputs, **tokenizer_kwargs)
            data_point.prefix_length_ = len(data_point.tokens)
            if data_point.citations is not None:
                # Prefer precomputed citation embeddings when present;
                # otherwise tokenize each citation document.
                if data_point.citation_embeds is None:
                    data_point.citation_tokens = [tokenizer.encode(c, **tokenizer_kwargs)
                                                  for c in data_point.citations]
                else:
                    data_point.citation_tokens = data_point.citation_embeds
            if idx % 10000 == 0:
                logging.info(f"Encode text data: {idx}/{len(data)}")

        return data

    @staticmethod
    def from_config(config: Dict[str, any]) -> List["EvaluateConfig"]:
        """Build one EvaluateConfig per ';'-separated task name in `config`.

        Task names missing from task_dict are silently skipped.
        """
        adapter_name = config["name"]
        data_path = config.get("data", None)
        task_list = config.get("task_name", "casual").split(";")
        path_list = (
            [None] * len(task_list) if data_path is None else data_path.split(";")
        )
        config_list = []
        for task_name_, data_path_ in zip(task_list, path_list):
            if task_name_ not in task_dict:
                continue
            config_list.append(
                EvaluateConfig(
                    adapter_name=adapter_name,
                    task_name=task_name_,
                    data_path=data_path_,
                    batch_size=config["evaluate_batch_size"],
                )
            )

        return config_list

    def prepare(self, tokenizer: Tokenizer, device: str):
        """Load task, data and metric; precompute label token ids for
        common-sense tasks (stored in label_indices_)."""
        self.reset_parameters()
        assert (
            self.task_name != "casual"
        ), "Auto evaluation is not currently available for casual supervised fine-tuning tasks."
        self.task_ = task_dict[self.task_name]
        self.data_ = self._dataload_fn(tokenizer)
        self.metric_ = self.task_.loading_metric()
        if isinstance(self.task_, CommonSenseTask):
            labels = self.task_.label_list()
            label_indices = [0] * len(labels)
            for idx, label in enumerate(labels):
                # Use the last token of " <label>" as the class logit index.
                ids = tokenizer.encode(" " + label)
                label_indices[idx] = ids[-1]
            self.label_indices_ = torch.tensor(
                label_indices, dtype=torch.int64, device=device
            )
        else:
            self.label_indices_ = None

    def reset_parameters(self):
        """Clear loaded task state and rewind all batch cursors."""
        self.task_ = None
        self.data_ = None
        self.metric_ = None
        self.rollback_start_idx_ = 0
        self.batch_start_idx_ = 0
        self.batch_end_idx_ = 0
157
+
158
+
159
def _prepare_tasks(model, tokenizer, configs):
    """Prepare every evaluation config and propagate the router-profiling
    flag into each MixLoRA MoE layer of the corresponding adapter."""
    for config in configs:
        config.prepare(tokenizer, model.device_)
        # Router profiling only applies to MixLoRA adapters.
        if not isinstance(model.adapter_configs_[config.adapter_name], MixLoraConfig):
            continue
        for layer in model.model_.layers_:
            if config.adapter_name in layer.mlp_.moes_:
                layer.mlp_.moes_[config.adapter_name].router_profile_ = (
                    config.router_profile
                )
169
+
170
+
171
def _dispatch_task_in(tokenizer, configs, concurrent_jobs, max_seq_len):
    """Assemble the next evaluation batch from up to `concurrent_jobs` configs.

    Advances each participating config's batch cursor and returns
    (current_configs, sequence_lengths, batch_labels, more_grounds,
    LLMModelInput) ready for model.forward().
    """
    batch_data_config = []
    sequence_lengths = []
    current_configs = []
    batch_tokens = []
    batch_labels = []
    more_grounds = []
    atten_masks = []
    max_tokens_len = 0
    for config in configs:
        if len(current_configs) >= concurrent_jobs:
            break
        # Skip configs whose data is exhausted.
        if config.batch_start_idx_ >= len(config.data_):
            continue
        config.batch_end_idx_ = min(
            config.batch_start_idx_ + config.batch_size, len(config.data_)
        )
        batch_start_idx = len(batch_tokens)
        for idx in range(config.batch_start_idx_, config.batch_end_idx_):
            if idx >= len(config.data_):
                break
            tokens = config.data_[idx].tokens
            labels = config.data_[idx].labels
            grounds = config.data_[idx].grounds
            # Truncate over-long inputs to the sequence limit.
            if len(tokens) > max_seq_len:
                tokens = tokens[:max_seq_len]
            max_tokens_len = max(len(tokens), max_tokens_len)
            batch_tokens.append(tokens)
            if labels:
                # NOTE(review): [labels].copy() wraps labels in a one-element
                # list; if labels is itself a list, labels.copy() was likely
                # intended (compare grounds below) — confirm downstream shape.
                batch_labels.append([labels].copy())
            if grounds:
                more_grounds.append(grounds.copy())

        config.batch_start_idx_ = config.batch_end_idx_
        current_configs.append(config)
        batch_data_config.append(
            LLMBatchConfig(
                adapter_name_=config.adapter_name,
                batch_start_idx_=batch_start_idx,
                batch_end_idx_=len(batch_tokens),
            )
        )

    # Shrink the padding target to the longest sequence actually batched.
    max_seq_len = min(max_seq_len, max_tokens_len)

    for tokens in batch_tokens:
        sequence_lengths.append(len(tokens) - 1)
        # NOTE(review): non-truncated entries alias config.data_[idx].tokens,
        # so this pads the stored data in place — confirm that is intended.
        while len(tokens) < max_seq_len:
            tokens.append(tokenizer.pad_id_)
        atten_masks.append(tokenizer.mask_from(tokens))

    return (
        current_configs,
        sequence_lengths,
        batch_labels,
        more_grounds,
        LLMModelInput(
            batch_configs_=batch_data_config,
            batch_tokens_=batch_tokens,
            batch_masks_=atten_masks,
            inference_mode_=True,
        ),
    )
234
+
235
+
236
def _compute_metrcis(model, current_configs, sequence_lengths, batch_labels, outputs):
    """Score one forward pass: pool per-sequence logits and feed predictions
    plus references into each config's metric.

    NOTE(review): name is misspelled ("metrcis") but kept — callers in this
    file use it.
    """
    for idx, output in enumerate(outputs):
        config: EvaluateConfig = current_configs[idx]
        task: BasicTask = config.task_
        metric: BasicMetric = config.metric_
        start_idx = output.batch_start_idx_
        end_idx = output.batch_end_idx_
        logits = output.logits

        if config.router_profile:
            adapter_config = model.adapter_configs_[config.adapter_name]
            if isinstance(adapter_config, MixLoraConfig):
                router_statistic_ = list(0 for _ in range(adapter_config.num_experts_))
                # NOTE(review): the inner loops below shadow the outer `idx`;
                # harmless here (enumerate reassigns it), but rename-worthy.
                for layer in model.model_.layers_:
                    if config.adapter_name not in layer.mlp_.moes_:
                        continue
                    for idx, val in enumerate(
                        layer.mlp_.moes_[config.adapter_name].profiler_
                    ):
                        router_statistic_[idx] += val
                for idx, val in enumerate(router_statistic_):
                    logging.info(
                        # /32 presumably averages over layers — confirm.
                        f"{config.adapter_name}: expert {idx}, load = {val/32}"
                    )

        batch_size = logits.shape[0]
        # Take the logits at each sequence's last real token position.
        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device),
            sequence_lengths[start_idx:end_idx],
        ]
        labels = torch.tensor(
            batch_labels[start_idx:end_idx],
            dtype=task.label_dtype_,
            device=logits.device,
        )
        if task.task_type_ == "common_sense":
            # Restrict to the candidate label token ids, then pick the best.
            pooled_logits = pooled_logits[:, config.label_indices_]
            pooled_logits = pooled_logits.softmax(-1).argmax(-1)
        elif task.task_type_ == "single_label_classification":
            pooled_logits = pooled_logits.softmax(-1).argmax(-1)
            pooled_logits = pooled_logits.to(task.label_dtype_)
        elif task.task_type_ != "multi_label_classification":
            raise ValueError(f"unknown task type {task.task_type_}")

        metric.add_batch(
            predictions=pooled_logits.detach().cpu(), references=labels.detach().cpu()
        )
        logging.info(f"{config.adapter_name} evaluate data:")
        logging.info(f"    step: {config.batch_start_idx_}/{len(config.data_)}")
285
+
286
+
287
def _compute_result(model, configs, save_file):
    """Collect final metric results per config and save or print them.

    `save_file` is treated as a directory; results go to
    <save_file>/<adapter_name>.json.
    """
    results = []
    for config in configs:
        result = {
            "adapter_name": config.adapter_name,
            "task_name": config.task_name,
            "date_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "metrics": {},
        }
        compute_results = config.metric_.compute()
        result["metrics"] = compute_results
        if config.router_profile:
            adapter_config = model.adapter_configs_[config.adapter_name]
            if isinstance(adapter_config, MixLoraConfig):
                router_statistic_ = list(0 for _ in range(adapter_config.num_experts_))
                for layer in model.model_.layers_:
                    if config.adapter_name not in layer.mlp_.moes_:
                        continue
                    for idx, val in enumerate(
                        layer.mlp_.moes_[config.adapter_name].profiler_
                    ):
                        router_statistic_[idx] += val
                    # Reset the profiler so a later run starts fresh.
                    layer.mlp_.moes_[config.adapter_name].profiler_ = None
                # /32 presumably averages over layers — TODO confirm.
                result["router_profile"] = list(val / 32 for val in router_statistic_)

        results.append(result)

    if save_file is not None:
        if not os.path.exists(save_file):
            os.makedirs(save_file)
        # NOTE(review): `config` here is the last loop variable, so all
        # results are written under the final adapter's name — confirm
        # whether per-adapter files were intended.
        file_path = save_file + os.sep + f"{config.adapter_name}.json"
        with open(file_path, "w") as f:
            json.dump(results, f, indent=4)
        logging.info(f"saving evaluation result to {file_path}")
    else:
        print(json.dumps(results, indent=4))

    return results
325
+
326
def _dispatch_task_in2(
    tokenizer,
    configs: List[GenerateConfig],  # uses config.data_, config.batch_size, config.adapter_name
    concurrent_jobs: int,
    strategy: str = "fair",
):
    """Pop pending generation jobs off each config's data_ queue.

    "fair" splits the job budget evenly across configs; "fifo" lets the
    first configs consume the whole budget. Popped items are removed from
    config.data_, which is the sole scheduling signal.

    Returns (current_jobs, batch_config, input_tokens, max_tokens_len,
    min_tokens_len).
    """
    assert strategy in ["fair", "fifo"], f"Unknown dispatch strategy {strategy}"
    current_jobs = []
    batch_config = []
    input_tokens = []
    max_tokens_len = 0
    min_tokens_len = sys.maxsize
    for config in configs:
        if len(batch_config) >= concurrent_jobs:
            break

        if len(config.data_) == 0:
            continue
        print(f"count down:{len(config.data_)}")
        if strategy == "fair":
            per_task_jobs = max(concurrent_jobs // len(configs), 1)
        else:
            per_task_jobs = concurrent_jobs

        per_task_jobs = min(per_task_jobs, config.batch_size)

        batch_start_idx = len(input_tokens)
        while per_task_jobs > 0 and len(config.data_) > 0:
            per_task_jobs = per_task_jobs - 1
            data = config.data_.pop(0)
            current_jobs.append(data)
            tokens = data.tokens
            max_tokens_len = max(len(tokens), max_tokens_len)
            min_tokens_len = min(len(tokens), min_tokens_len)
            input_tokens.append(tokens)

        batch_config.append(
            LLMBatchConfig(
                adapter_name_=config.adapter_name,
                batch_start_idx_=batch_start_idx,
                batch_end_idx_=len(input_tokens),
            )
        )

    return (
        current_jobs,
        batch_config,
        input_tokens,
        max_tokens_len,
        min_tokens_len,
    )
377
+
378
+
379
def _generate_then_compute_metrics(
    model, tokenizer, concurrent_jobs, \
    max_gen_len, current_configs: List[EvaluateConfig],\
    require_attention: Optional[int] = -1, require_hide: Optional[int] = -1
):
    """Run free-form generation for attribution-style tasks and feed each
    output (with its QA pairs / answer / docs / query) into the metric.

    Drains config.data_ batch by batch via _dispatch_task_in2; jobs that
    are still running after a round are pushed back into data_.
    """
    # grounds are the QA pairs
    metric = current_configs[0].metric_.metric_

    ###outputs, hidden_output, hidden_atten = model.forward(input_args)

    # TODO(review): current_configs should be converted to GenerateConfig
    # here; it is currently an EvaluateConfig.
    #cnt = 50
    #cases = []
    while True:  # config.data_ shrinks as jobs complete; it drives the loop
        dispatch_args = _dispatch_task_in2(tokenizer, current_configs, concurrent_jobs)
        # dispatch_args holds: current_jobs,
        # batch_config (LLMBatchConfig(taskname, start, end)),
        # batch_tokens, max_length, min_length

        if len(dispatch_args[0]) == 0:
            break
        use_cache = True
        cache_implementation = model.model_.cache_implementation()
        if cache_implementation is None:
            logging.warn(
                "Cache disabled by model, use cache_implementation to force enable."
            )
            use_cache = False
        outputs, running_jobs = _batch_generate(
            model,
            tokenizer,
            max_gen_len,
            use_cache,
            require_attention,
            require_hide,
            cache_implementation,
            None,
            *dispatch_args,
        )
        # Requeue unfinished jobs for the next dispatch round.
        for data in running_jobs:
            current_configs[0].data_.append(data)

        print(f"\noutput:{outputs[0]}\n")
        # NOTE(review): only outputs[0] / dispatch_args[0][0] are scored each
        # round — confirm batches larger than one are intended to be ignored.
        metric.add_batch(
            {
                'output': outputs[0],
                'qa_pairs': dispatch_args[0][0].grounds,
                'answer': dispatch_args[0][0].labels,
                'docs': dispatch_args[0][0].citations,
                'query': dispatch_args[0][0].query,
            }
        )
430
+
431
+
432
@torch.inference_mode()
def evaluate(
    model: LLMModel,
    tokenizer: Tokenizer,
    configs: List[EvaluateConfig],
    max_concurrent_jobs: int = None,
    retrying_steps: int = 20,
    max_seq_len: int = 512,
    save_file: str = None,
    require_attention: Optional[int] = -1,
    require_hide: Optional[int] = -1,
) -> Dict:
    """Evaluate every config against the model.

    Batches are dispatched with adaptive concurrency: on CUDA OOM the
    concurrency is reduced, the affected configs roll back to their last
    committed cursor, and concurrency recovers after `retrying_steps`
    successful rounds. 'attribute' tasks use generation-based scoring;
    all others use a single forward pass.
    """

    if max_concurrent_jobs is None:
        max_concurrent_jobs = len(configs)
        logging.info(
            f"Setting max_concurrent_jobs to {max_concurrent_jobs} automatically"
        )

    assert max_concurrent_jobs > 0
    assert retrying_steps > 0

    _prepare_tasks(model, tokenizer, configs)

    concurrent_jobs = max_concurrent_jobs
    retrying_count = 0
    while True:
        # Gradually restore concurrency after an OOM-triggered reduction.
        if concurrent_jobs < max_concurrent_jobs and retrying_count > 0:
            retrying_count -= 1
            if retrying_count == 0:
                concurrent_jobs += 1
                logging.info(f"recovering concurrent jobs to {concurrent_jobs}")

        current_configs, sequence_lengths, batch_labels, grounds, input_args = _dispatch_task_in(
            tokenizer, configs, concurrent_jobs, max_seq_len
        )
        # current_configs: configs participating in this batch
        # sequence_lengths: length of each token sequence in the batch
        # batch_labels, grounds: references for scoring
        # input_args: LLMModelInput with batch configs
        # (adapter_name, start, end), tokens and attention masks
        if len(current_configs) == 0:
            break

        try:
            if current_configs[0].task_.task_type_ == 'attribute':
                _generate_then_compute_metrics(
                    model,
                    tokenizer,
                    concurrent_jobs,
                    max_seq_len,
                    current_configs,
                    require_attention,
                    require_hide
                )
            else:
                _compute_metrcis(
                    model,
                    current_configs,
                    sequence_lengths,
                    batch_labels,
                    model.forward(input_args),
                )

        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                concurrent_jobs -= 1
                if concurrent_jobs == 0:
                    raise e
                logging.warn(
                    f"deprecating concurrent jobs to {concurrent_jobs} due to OOM."
                )
                # rollback
                retrying_count = retrying_steps
                for config in current_configs:
                    config.batch_start_idx_ = config.rollback_start_idx_
                    logging.info(
                        f"{config.adapter_name}: rollback to {config.batch_start_idx_}/{len(config.data_)}"
                    )
                continue
            else:
                raise e

        # Commit this round's progress so a future rollback replays only
        # the failed batch.
        for config in current_configs:
            config.rollback_start_idx_ = config.batch_start_idx_

    return _compute_result(model, configs, save_file)
c2cite/executors/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+
4
+ import torch
5
+
6
+ from .common import BasicExecutor
7
+ from .cpu import CPUExecutor
8
+ from .cuda import CUDAExecutor
9
+ from .mps import MPSExecutor
10
+
11
# Registry mapping MOE_PEFT_EXECUTOR_TYPE values to executor classes.
executor_dict = {
    "CUDA": CUDAExecutor,
    "MPS": MPSExecutor,
    "CPU": CPUExecutor,
}
16
+
17
+
18
def _init_executor():
    """Instantiate the process-wide executor backend.

    The MOE_PEFT_EXECUTOR_TYPE environment variable, when set, forces a
    specific backend; otherwise the best available one is chosen in the
    order CUDA > MPS > CPU.
    """
    forced = os.getenv("MOE_PEFT_EXECUTOR_TYPE")
    if forced is not None:
        forced = forced.upper()
        if forced not in executor_dict:
            raise ValueError(f"Assigning unknown executor type {forced}")
        return executor_dict[forced]()
    if torch.cuda.is_available():
        return CUDAExecutor()
    if torch.backends.mps.is_available():
        return MPSExecutor()
    return CPUExecutor()
31
+
32
+
33
+ executor: BasicExecutor = _init_executor()
34
+
35
+
36
class no_cache(object):
    """Context manager that empties the executor's device cache and runs the
    garbage collector on both entry and exit, minimizing resident memory
    around the wrapped block."""

    def __enter__(self):
        executor.empty_cache()
        gc.collect()
        return self

    def __exit__(self, type, value, traceback):
        executor.empty_cache()
        gc.collect()
45
+
46
+
47
+ __all__ = [
48
+ "BasicExecutor",
49
+ "CUDAExecutor",
50
+ "MPSExecutor",
51
+ "CPUExecutor",
52
+ "executor",
53
+ "no_cache",
54
+ ]
c2cite/executors/common.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import random
3
+
4
+ import torch
5
+ from transformers.utils import is_torch_bf16_available_on_device
6
+
7
+ from moe_peft.utils import NoneContexts
8
+
9
+
10
class BasicExecutor:
    """Abstract device-backend interface (CUDA/MPS/CPU implementations).

    Subclasses override the NotImplementedError stubs; the concrete helpers
    here are shared defaults that delegate to torch.
    """

    def name(self) -> str:
        """Human-readable backend name, e.g. "NVIDIA CUDA"."""
        raise NotImplementedError()

    def device_name(self) -> str:
        """torch device string, e.g. "cuda" / "mps" / "cpu"."""
        raise NotImplementedError()

    def default_device_name(self) -> str:
        """Device string with a default index; falls back to device_name()."""
        return self.device_name()

    def is_available(self) -> bool:
        raise NotImplementedError()

    def is_initialized(self) -> bool:
        raise NotImplementedError()

    def is_bf16_supported(self) -> bool:
        return is_torch_bf16_available_on_device(self.device_name())

    def manual_seed(self, seed: int):
        # Seed both Python's RNG and torch's global RNG.
        random.seed(seed)
        torch.manual_seed(seed)

    def empty_cache(self):
        raise NotImplementedError()

    def use_deterministic_algorithms(self, mode: bool):
        torch.use_deterministic_algorithms(mode)

    def allow_tf32(self, mode: bool):
        raise NotImplementedError()

    def set_rng_state(self, device, state):
        raise NotImplementedError()

    def get_rng_state(self, device):
        raise NotImplementedError()

    def fork_rng(self, rng_devices: list):
        """Fork the RNG state for the given devices (restored on exit)."""
        return torch.random.fork_rng(
            devices=rng_devices, device_type=self.device_name()
        )

    def autocast(self, **kwargs):
        # Default: no mixed precision — a no-op context manager.
        return NoneContexts()

    def init_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
        """Allocate an uninitialized tensor shaped like `tensor`."""
        return torch.empty_like(tensor)

    def index_fill(
        self, input: torch.Tensor, dim: int, index: torch.Tensor, value: torch.Tensor
    ):
        input.index_fill_(dim, index, value)

    def index_copy(
        self, input: torch.Tensor, dim: int, index: torch.Tensor, source: torch.Tensor
    ):
        input.index_copy_(dim, index, source)

    def check_available(self):
        """Log and return whether this backend is usable right now."""
        if not self.is_available():
            logging.error(f"{self.name()} not available.")
            return False
        if not self.is_initialized():
            logging.error(f"{self.name()} not initialized.")
            return False
        logging.info(f"{self.name()} initialized successfully.")
        return True
c2cite/executors/cpu.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import logging
3
+
4
+ import torch
5
+
6
+ from .common import BasicExecutor
7
+
8
+
9
class CPUExecutor(BasicExecutor):
    """Executor backend that runs everything on the host CPU."""

    def __init__(self) -> None:
        super().__init__()

    def name(self) -> str:
        return "CPU"

    def device_name(self) -> str:
        return "cpu"

    def is_available(self) -> bool:
        return True

    def is_initialized(self) -> bool:
        # The CPU needs no initialization; check_available() below is
        # overridden so this False never blocks availability checks.
        return False

    def empty_cache(self):
        # No device cache to release on CPU.
        pass

    def allow_tf32(self, mode: bool):
        # TF32 is a CUDA feature; only mode=False is meaningful here.
        assert not mode, "Enabling tf32 for CPU."

    def set_rng_state(self, device: int, state: torch.Tensor):
        assert device == 0
        torch.set_rng_state(state)

    def get_rng_state(self, device: int):
        assert device == 0
        return torch.get_rng_state()

    @contextlib.contextmanager
    def fork_rng(self, rng_devices: list):
        # TODO: change to official implementation
        # Save/restore only the CPU RNG state around the block.
        assert len(rng_devices) == 0
        cpu_rng_state = torch.get_rng_state()
        try:
            yield
        finally:
            torch.set_rng_state(cpu_rng_state)

    def check_available(self):
        # CPU is always usable; skip the base-class availability checks.
        logging.info(f"{self.name()} initialized successfully.")
        return True
c2cite/executors/cuda.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from .common import BasicExecutor
4
+
5
+
6
class CUDAExecutor(BasicExecutor):
    """Executor backend for NVIDIA GPUs via torch.cuda."""

    def __init__(self) -> None:
        super().__init__()
        # Eagerly initialize the CUDA runtime so is_initialized() holds.
        torch.cuda.init()

    def name(self) -> str:
        return "NVIDIA CUDA"

    def device_name(self) -> str:
        return "cuda"

    def default_device_name(self) -> str:
        return "cuda:0"

    def is_available(self) -> bool:
        return torch.cuda.is_available()

    def is_initialized(self) -> bool:
        return torch.cuda.is_initialized()

    def is_bf16_supported(self) -> bool:
        return torch.cuda.is_bf16_supported()

    def manual_seed(self, seed: int):
        super().manual_seed(seed)
        # Also seed every visible CUDA device.
        torch.cuda.manual_seed_all(seed)

    def empty_cache(self):
        torch.cuda.empty_cache()

    def use_deterministic_algorithms(self, mode: bool):
        # cudnn benchmark autotuning is nondeterministic, so it is the
        # inverse of deterministic mode.
        torch.backends.cudnn.benchmark = not mode
        torch.backends.cudnn.deterministic = mode

    def allow_tf32(self, mode: bool):
        torch.backends.cudnn.allow_tf32 = mode
        torch.backends.cuda.matmul.allow_tf32 = mode

    def set_rng_state(self, device, state):
        with torch.cuda.device(device):
            return torch.cuda.set_rng_state(state)

    def get_rng_state(self, device):
        with torch.cuda.device(device):
            return torch.cuda.get_rng_state()

    def autocast(self, **kwargs):
        # NOTE(review): torch.cuda.amp.autocast is deprecated in newer torch
        # in favor of torch.amp.autocast("cuda", ...) — confirm the minimum
        # supported torch version before migrating.
        return torch.cuda.amp.autocast(**kwargs)
c2cite/executors/mps.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+
3
+ import torch
4
+
5
+ from .common import BasicExecutor
6
+
7
+
8
class MPSExecutor(BasicExecutor):
    """Executor backend for Apple-silicon GPUs via torch.mps."""

    def __init__(self) -> None:
        super().__init__()

    def name(self) -> str:
        return "APPLE MPS"

    def device_name(self) -> str:
        return "mps"

    def is_available(self) -> bool:
        return torch.backends.mps.is_available() and torch.backends.mps.is_built()

    def is_initialized(self) -> bool:
        # TODO: change to official implementation
        return not torch.mps._is_in_bad_fork()

    def manual_seed(self, seed: int):
        super().manual_seed(seed)
        torch.mps.manual_seed(seed)

    def empty_cache(self):
        torch.mps.empty_cache()

    def allow_tf32(self, mode: bool):
        # TF32 is a CUDA feature; only mode=False is meaningful here.
        assert not mode, "Enabling tf32 for MPS devices."

    def set_rng_state(self, device: int, state: torch.Tensor):
        assert device == 0
        return torch.mps.set_rng_state(state)

    def get_rng_state(self, device: int):
        assert device == 0
        return torch.mps.get_rng_state()

    @contextlib.contextmanager
    def fork_rng(self, rng_devices: list):
        # TODO: change to official implementation
        # Save/restore both the CPU and the single MPS RNG state.
        assert len(rng_devices) == 1 and rng_devices[0] == 0
        cpu_rng_state = torch.get_rng_state()
        device_rng_states = torch.mps.get_rng_state()
        try:
            yield
        finally:
            torch.set_rng_state(cpu_rng_state)
            torch.mps.set_rng_state(device_rng_states)

    def autocast(self, **kwargs):
        # TODO: change to official implementation
        # running with compatible mode
        return torch.cuda.amp.autocast(**kwargs)

    def init_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
        # Zero-fill instead of empty_like — presumably a workaround for
        # MPS uninitialized-memory behavior; TODO confirm.
        return torch.zeros_like(tensor)

    def index_fill(
        self, input: torch.Tensor, dim: int, index: torch.Tensor, value: torch.Tensor
    ):
        # Deliberate no-op on MPS (index_fill_ unsupported/broken there) —
        # NOTE(review): callers must not rely on the fill taking effect.
        pass

    def index_copy(
        self, input: torch.Tensor, dim: int, index: torch.Tensor, source: torch.Tensor
    ):
        # index_add_ used as a stand-in for index_copy_; equivalent only
        # when the target rows are zero — TODO confirm with init_tensor().
        input.index_add_(dim, index, source)
c2cite/generator.py ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import re
8
+ import matplotlib.pyplot as plt
9
+
10
+ from moe_peft.common import LLMBatchConfig, LLMModelInput, Tokens, cache_factory
11
+ from moe_peft.executors import executor
12
+ from moe_peft.model import LLMModel
13
+ from moe_peft.prompter import Prompter
14
+ from moe_peft.tokenizer import Tokenizer
15
+ from moe_peft.solutions import get_output
16
+
17
+
18
+ @dataclass
19
+ class GenerateData:
20
+ adapter_name_: str = None
21
+ prompt_index_: int = None
22
+ prefix_length_: int = None
23
+ raw_tokens_: Tokens = None
24
+
25
+
26
+ @dataclass
27
+ class GenerateConfig:
28
+ adapter_name: str = None
29
+ prompts: List[Union[str, Tuple[str, str]]] = None
30
+ prompt_template: str = None
31
+ # Generate Arguments
32
+ batch_size: int = 8
33
+ stop_token: str = None
34
+ temperature: float = 1
35
+ top_p: float = 0.9
36
+ top_k: float = 50
37
+ do_sample: bool = True
38
+ repetition_penalty: float = 1.1
39
+ renormalize_logits: bool = True
40
+ # Do not set these manually
41
+ prompter_: Prompter = None
42
+ stop_token_: torch.Tensor = None
43
+ data_: List[GenerateData] = None
44
+
45
+ # Set prompt_template_ to enable the prompter
46
+ def generate_prompt(self, instruction: str, input: str = None) -> str:
47
+ if self.prompter_ is None:
48
+ self.prompter_ = Prompter(self.prompt_template)
49
+
50
+ return self.prompter_.generate_prompt(instruction=instruction, input=input)
51
+
52
+ def get_prompts(self) -> List[str]:
53
+ prompts = []
54
+ for prompt in self.prompts:
55
+ args = prompt if isinstance(prompt, Tuple) else (prompt, None)
56
+ prompts.append(self.generate_prompt(*args))
57
+
58
+ return prompts
59
+
60
+ def get_response(self, output: str) -> str:
61
+ if self.prompter_ is None:
62
+ return output.strip()
63
+ else:
64
+ return self.prompter_.get_response(output)
65
+
66
+ def reset_parameters(self):
67
+ self.prompter_ = Prompter(self.prompt_template)
68
+ self.stop_token_ = None
69
+ self.data_ = []
70
+
71
+
72
+ def _logits_sample_top_p(probs, p, filter_value=float("-inf"), min_tokens_to_keep=1):
73
+ sorted_logits, sorted_indices = torch.sort(probs, descending=False)
74
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
75
+ sorted_indices_to_remove = cumulative_probs <= (1 - p)
76
+ sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0
77
+ indices_to_remove = sorted_indices_to_remove.scatter(
78
+ 1, sorted_indices, sorted_indices_to_remove
79
+ )
80
+ return probs.masked_fill(indices_to_remove, filter_value)
81
+
82
+
83
+ def _logits_sample_top_k(probs, k, filter_value=float("-inf")):
84
+ top_k = min(k, probs.size(-1)) # Safety check
85
+ indices_to_remove = probs < torch.topk(probs, top_k)[0][..., -1, None]
86
+ return probs.masked_fill(indices_to_remove, filter_value)
87
+
88
+
89
+ def _logits_repetition_penalty(prev_tokens, probs, penalty):
90
+ score = torch.gather(probs, 1, prev_tokens)
91
+ score = torch.where(score < 0, score * penalty, score / penalty)
92
+ probs.scatter_(1, prev_tokens, score)
93
+ return probs
94
+
95
+
96
+ def id2token(x):
97
+ if x == 0:
98
+ return 128002
99
+ elif x == 1:
100
+ return 128003
101
+ elif x == 2:
102
+ return 128004
103
+ elif x == 3:
104
+ return 128005
105
+ elif x == 4:
106
+ return 128008
107
+ elif x >= 5:
108
+ return 128005 + x
109
+ else:
110
+ assert False, "wrong router"
111
+
112
+ def logits_process(
113
+ probs: torch.Tensor,
114
+ prev_tokens: torch.Tensor,
115
+ cite_flag = False,
116
+ temperature=0.9,
117
+ top_p=0,
118
+ top_k=0,
119
+ do_sample=True,
120
+ repetition_penalty=1.01,
121
+ renormalize_logits=True,
122
+ ):
123
+ if cite_flag == False:
124
+ process_conditions = any([repetition_penalty > 0])
125
+ sample_conditions = any([temperature > 0, top_p > 0 and top_p <= 1.0, top_k > 0])
126
+
127
+ if not do_sample and sample_conditions:
128
+ do_sample = True
129
+ logging.warn("do_sample force to enabled.")
130
+
131
+ if repetition_penalty > 0:
132
+ probs = _logits_repetition_penalty(prev_tokens, probs, repetition_penalty)
133
+
134
+ if process_conditions and renormalize_logits:
135
+ probs = probs.log_softmax(-1)
136
+
137
+ if temperature > 0:
138
+ probs = probs / temperature
139
+
140
+ if top_k > 0:
141
+ probs = _logits_sample_top_k(probs, top_k)
142
+
143
+ if top_p > 0 and top_p <= 1.0:
144
+ probs = _logits_sample_top_p(probs, top_p)
145
+
146
+ if sample_conditions and renormalize_logits:
147
+ probs = probs.log_softmax(-1)
148
+ else:
149
+ do_sample = False
150
+
151
+ if do_sample:
152
+ probs = torch.softmax(probs, dim=-1)
153
+ next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
154
+ else:
155
+ next_token = torch.argmax(probs, dim=-1)
156
+
157
+ if cite_flag:
158
+ for i in range(probs.shape[0]):
159
+ next_token[i] = id2token(next_token[i] + 1)
160
+ return next_token.reshape(-1)
161
+
162
+
163
def _extract_effective_tokens(
    tokenizer: Tokenizer,
    prefix_length: int,
    tokens: Tokens,
    remove_prefix=True,
    remove_pad=True,
    remove_eos=True,
):
    """Strip the prompt prefix, trailing padding and EOS from a token list.

    Truncation at the first pad/EOS id only happens when that id is
    actually present, so already-clean sequences pass through unchanged.
    """
    result = tokens[prefix_length:] if remove_prefix else tokens

    if remove_pad and tokenizer.pad_id_ in result:
        result = result[: result.index(tokenizer.pad_id_)]

    if remove_eos and tokenizer.eos_id_ in result:
        result = result[: result.index(tokenizer.eos_id_)]

    return result
183
+
184
+
185
def _gen_outputs(
    tokenizer: Tokenizer,
    config_dict: Dict[str, GenerateConfig],
    current_jobs: List[GenerateData],
    tokens: torch.Tensor,
):
    """Decode the current token matrix into per-adapter response strings.

    Used by the streaming callback: each job's row is cleaned of prompt,
    padding and EOS, decoded, post-processed by its adapter's config, and
    grouped under the adapter name.
    """
    token_rows = tokens.tolist()
    packed_outputs: Dict[str, List[str]] = {}
    for idx, job in enumerate(current_jobs):
        cleaned = _extract_effective_tokens(
            tokenizer,
            job.prefix_length_,
            token_rows[idx],
            remove_prefix=True,
            remove_pad=True,
            remove_eos=True,
        )
        response = config_dict[job.adapter_name_].get_response(tokenizer.decode(cleaned))
        packed_outputs.setdefault(job.adapter_name_, []).append(response)

    return packed_outputs
212
+
213
+
214
def _dispatch_task_in(
    configs: List[GenerateConfig],
    concurrent_jobs: int,
    strategy: str = "fair",
):
    """Pop the next batch of pending prompts from the per-task queues.

    Under "fair" each task gets roughly an equal share of the concurrent
    job budget; under "fifo" earlier tasks may fill the whole budget.
    Consumed jobs are removed from each config's ``data_`` queue, which is
    what eventually terminates the caller's scheduling loop.

    Returns:
        Tuple of (current_jobs, batch_config, input_tokens,
        max_tokens_len, min_tokens_len).
    """
    assert strategy in ["fair", "fifo"], f"Unknown dispatch strategy {strategy}"
    current_jobs = []
    batch_config = []
    input_tokens = []
    longest = 0
    shortest = sys.maxsize
    for config in configs:
        if len(batch_config) >= concurrent_jobs:
            break

        if not config.data_:
            continue

        # Budget for this task under the chosen strategy, capped by the
        # task's own batch size.
        if strategy == "fair":
            budget = max(concurrent_jobs // len(configs), 1)
        else:
            budget = concurrent_jobs
        budget = min(budget, config.batch_size)

        first_slot = len(input_tokens)
        while budget > 0 and config.data_:
            budget -= 1
            job = config.data_.pop(0)
            current_jobs.append(job)
            job_tokens = job.raw_tokens_
            longest = max(longest, len(job_tokens))
            shortest = min(shortest, len(job_tokens))
            input_tokens.append(job_tokens)

        batch_config.append(
            LLMBatchConfig(
                adapter_name_=config.adapter_name,
                batch_start_idx_=first_slot,
                batch_end_idx_=len(input_tokens),
            )
        )

    return (
        current_jobs,
        batch_config,
        input_tokens,
        longest,
        shortest,
    )
264
+
265
+
266
def _dispatch_task_out(
    tokenizer: Tokenizer,
    # config_dict: Dict[str, GenerateConfig],
    current_jobs: List[GenerateData],
    tokens: torch.Tensor,
    stop_reached: torch.Tensor,
    attentions,
    hides,
    require_attention,
    require_hide
):
    """Split finished jobs from still-running ones after a decode round.

    Finished rows (``stop_reached`` True) are cleaned, decoded, and have
    any ``<|reserved_special_token_N|>`` markers rewritten to ``[N]``
    citation brackets. Unfinished rows keep their accumulated tokens
    (prefix retained) and are returned for re-dispatch.

    ``attentions``/``hides``/``require_attention``/``require_hide`` are
    currently unused — the attention/hidden-state post-processing they fed
    is commented out.

    Returns:
        (packed_outputs, running_jobs): decoded strings for finished jobs
        and the job objects still in flight.
    """
    tokens = tokens.tolist()
    stop_reached = stop_reached.view(-1).tolist()
    packed_outputs: List[str] = []
    packed_add = []  # NOTE(review): never populated or returned
    running_jobs: List[GenerateData] = []
    for idx, data in enumerate(current_jobs):  # each entry is a GenerateData job
        if stop_reached[idx]:
            output_tokens = _extract_effective_tokens(
                tokenizer,
                data.prefix_length_,
                tokens[idx],
                remove_prefix=True,
                remove_pad=True,
                remove_eos=True,
            )
            output_s = tokenizer.decode(output_tokens).strip()
            # Reserved special tokens encode citations; render them as [N].
            output = re.sub(r'<\|reserved_special_token_(\d+)\|>', r'[\1]', output_s)
            packed_outputs.append(output)
        else:
            # Keep the full (prefix included) token stream so decoding can
            # resume where it stopped.
            data.tokens = _extract_effective_tokens(
                tokenizer,
                data.prefix_length_,
                tokens[idx],
                remove_prefix=False,
                remove_pad=True,
                remove_eos=False,
            )
            running_jobs.append(data)

    return packed_outputs, running_jobs
318
+
319
+
320
def _batch_generate(
    model: LLMModel,
    tokenizer: Tokenizer,
    max_gen_len: Optional[int],
    use_cache: bool,
    require_attention: Optional[int],
    require_hide: Optional[int],
    cache_implementation: Optional[str],
    stream_callback: Optional[Callable],
    #config_dict: Dict[str, GenerateConfig],
    current_jobs: List[GenerateData],
    batch_config: List[LLMBatchConfig],
    input_tokens: List[Tokens],
    max_tokens_len: int,
    min_tokens_len: int,
):
    """Citation-aware batched decoding loop.

    Decodes token-by-token, feeding the model the positions and values of
    all citation special tokens seen so far (``cite``/``cite_v``) plus the
    first job's document tokens, and tracking ``[``-opened citation spans.

    NOTE(review): the citation bookkeeping reads ``input_tokens[0]`` and
    ``current_jobs[0]`` only, so this path effectively assumes
    batch_size == 1 — confirm before enabling larger batches.

    Returns the (finished_outputs, running_jobs) pair produced by
    ``_dispatch_task_out``.
    """
    executor.empty_cache()
    device = torch.device(model.device_)
    batch_size = len(input_tokens)
    if max_gen_len is None:
        max_gen_len = model.config_.max_seq_len_ - max_tokens_len
    total_len = min(model.config_.max_seq_len_, max_gen_len + max_tokens_len)
    past_key_values = (
        cache_factory(
            cache_implementation=cache_implementation,
            config=model.model_.model_config(),
            batch_size=batch_size,
            max_cache_len=total_len,
        )
        if cache_implementation is not None
        else None
    )

    # Right-pad every prompt into a fixed (batch, total_len) token matrix.
    tokens = torch.full(
        (batch_size, total_len), tokenizer.pad_id_, dtype=torch.int64, device=device
    )
    for k, t in enumerate(input_tokens):
        tokens[k, : len(t)] = torch.tensor(t, dtype=torch.int64, device=device)

    def condition(i):
        # True for citation-related reserved special-token ids.
        return (128010 <= i <= 128255) or i in {128005, 128004, 128003, 128002, 128008}

    prompt_len = len(input_tokens[0])
    # Positions and values of citation special tokens already in the prompt.
    cite = [index for index, value in enumerate(input_tokens[0]) if condition(value)]
    cite_v = [value for value in input_tokens[0] if condition(value)]

    prev_pos = 0
    stop_reached = torch.tensor([False] * batch_size, device=device)
    input_text_mask = tokens != tokenizer.pad_id_

    hidden_states = []
    hidden_attentions = []
    #arti_mask = torch.ones(batch_size, total_len, device=device, dtype=torch.int64)
    cite_start = -1  # position of the '[' opening the current citation span; -1 if none
    #flag = -1
    plac = []  # positions of digit tokens generated inside citation spans
    for cur_pos in range(min_tokens_len, total_len):
        input_data = LLMModelInput(
            batch_configs_=batch_config,
            batch_tokens_=tokens[:, prev_pos:cur_pos].tolist(),
            #batch_masks_ = arti_mask,
            batch_cites = [cite],
            batch_cites_value = [cite_v],
            batch_docs = [current_jobs[0].citation_tokens],
            batch_prompt_len = [prompt_len],
            inference_mode_=True,
        )
        outputs = model.forward(input_data, past_key_values)
        #hidden_states.append(hidden_state)
        #hidden_attentions.append(hidden_attention)

        for output in outputs:
            #config = config_dict[output.adapter_name]
            start_idx = output.batch_start_idx_
            end_idx = output.batch_end_idx_

            next_token = logits_process(
                output.logits[:, -1],  # logits for the last position only
                tokens[start_idx:end_idx, :cur_pos],
                cite_flag = output.cite_flag,
            )

            # Inside the prompt region, keep the prompt token instead of the
            # freshly sampled one.
            next_token = torch.where(
                input_text_mask[start_idx:end_idx, cur_pos],
                tokens[start_idx:end_idx, cur_pos],
                next_token,
            ).to(torch.int64)
            if output.cite_flag == True:  # record freshly generated citation tokens
                for i in range(start_idx, end_idx):
                    if input_text_mask[i, cur_pos]:  # prompt positions were recorded up front
                        continue
                    cite.append(cur_pos)
                    # NOTE(review): appends a tensor while prompt-derived
                    # entries are plain ints — confirm downstream handles both.
                    cite_v.append(next_token)

            tokens[start_idx:end_idx, cur_pos] = next_token
            # A row stops once it emits EOS outside the prompt region.
            stop_criteria = (~input_text_mask[start_idx:end_idx, cur_pos]) & (
                next_token == torch.tensor(
                    [tokenizer.eos_id_], dtype=torch.int64, device=device
                )
            )
            stop_reached[start_idx:end_idx] |= stop_criteria
            if cite_start != -1:
                # Sentence-ending punctuation closes the open citation span.
                if tokenizer.decode(next_token)[-1] in ['.','!','?']:
                    #arti_mask[start_idx:end_idx, cite_start:cur_pos] = 0
                    #tokens[start_idx:end_idx, cur_pos] = tokenizer.encode(tokenizer.decode(next_token)[-1])[-1]
                    cite_start = -1
                if tokenizer.decode(next_token)[-1] in ['0','1','2','3','4','5','6','7','8','9']:
                    plac.append(cur_pos)
                # tokens[start_idx:end_idx, cur_pos] = (tokens[start_idx:end_idx, cur_pos] + 2)

            # '[' opens a citation span.
            if tokenizer.decode(next_token)[-1] == '[' or tokenizer.decode(next_token) == '[':
                if cite_start == -1:
                    cite_start = cur_pos
                    #flag = cur_pos

        # Force-stop everything when the token buffer is about to run out.
        stop_reached |= total_len - cur_pos == 1

        if any(stop_reached):
            break

        if use_cache:
            prev_pos = cur_pos

    # Dead experimental attention-visualization code, retained as-is:
    """input_data = LLMModelInput(
        batch_configs_=batch_config,
        batch_tokens_=tokens[:,:hidden_attention.shape[0]].tolist(),
        inference_mode_=True,
    )"""
    #outputs, _, attn = model.forward(input_data, None, require_attention, require_hide)
    """for i in plac:

        plt.figure(figsize=(hidden_attention.shape[0], 5), dpi = 50)
        print("painting")
        plt.bar(range(hidden_attention.shape[0]), attn[:,i].cpu().numpy())
        plt.xticks(range(hidden_attention.shape[0]), [tokenizer.decode(j) for j in tokens[0][:hidden_attention.shape[0]]], fontsize = 8)
        plt.savefig("high_res_heatmap.svg", dpi=50)
        print("ok~")
        input()
    """
    """attn[torch.arange(hidden_attention.shape[0]), torch.arange(hidden_attention.shape[0])] = 0.0
    attn = torch.nn.functional.normalize(attn, p=2, dim=1)
    attn = attn[min_tokens_len:hidden_attention.shape[0],min_tokens_len:hidden_attention.shape[0]]

    plt.figure(figsize=(hidden_attention.shape[0] - min_tokens_len, hidden_attention.shape[0] - min_tokens_len))
    plt.imshow(attn.cpu().numpy(), cmap='viridis', vmin = 0, vmax = 0.1)
    plt.colorbar(label='Value')
    plt.xticks(range(hidden_attention.shape[0] - min_tokens_len), [tokenizer.decode(i) for i in tokens[0][min_tokens_len:hidden_attention.shape[0]]], fontsize = 10)
    plt.yticks(range(hidden_attention.shape[0] - min_tokens_len), [tokenizer.decode(i) for i in tokens[0][min_tokens_len:hidden_attention.shape[0]]], fontsize = 10)
    plt.savefig("high_res_heatmap.png", dpi=200)
    plt.show()
    print("ok~")
    input()"""
    """token2 = tokens * arti_mask
    lst = token2[0].tolist()
    lst = [ele for ele in lst if ele != 0]
    tokens = torch.tensor(lst, dtype=torch.int64, device=device).unsqueeze(0)"""

    return _dispatch_task_out(
        tokenizer, current_jobs, tokens, stop_reached, hidden_states, hidden_attentions, require_attention, require_hide
    )
485
+
486
+
487
def _batch_generate_original(
    model: LLMModel,
    tokenizer: Tokenizer,
    max_gen_len: Optional[int],
    use_cache: bool,
    cache_implementation: Optional[str],
    stream_callback: Optional[Callable],
    config_dict: Dict[str, GenerateConfig],
    current_jobs: List[GenerateData],
    batch_config: List[LLMBatchConfig],
    input_tokens: List[Tokens],
    max_tokens_len: int,
    min_tokens_len: int,
):
    """Plain (non-citation) batched decoding loop.

    Decodes token-by-token with each adapter's own sampling settings,
    optionally streaming intermediate outputs via ``stream_callback``,
    until every row hits its stop token or the buffer fills.
    """
    executor.empty_cache()
    device = torch.device(model.device_)
    batch_size = len(input_tokens)
    if max_gen_len is None:
        max_gen_len = model.config_.max_seq_len_ - max_tokens_len
    total_len = min(model.config_.max_seq_len_, max_gen_len + max_tokens_len)

    past_key_values = (
        cache_factory(
            cache_implementation=cache_implementation,
            config=model.model_.model_config(),
            batch_size=batch_size,
            max_cache_len=total_len,
        )
        if cache_implementation is not None
        else None
    )

    # Right-pad every prompt into a fixed (batch, total_len) token matrix.
    tokens = torch.full(
        (batch_size, total_len), tokenizer.pad_id_, dtype=torch.int64, device=device
    )
    for k, t in enumerate(input_tokens):
        tokens[k, : len(t)] = torch.tensor(t, dtype=torch.int64, device=device)

    prev_pos = 0
    stop_reached = torch.tensor([False] * batch_size, device=device)
    input_text_mask = tokens != tokenizer.pad_id_
    for cur_pos in range(min_tokens_len, total_len):
        input_data = LLMModelInput(
            batch_configs_=batch_config,
            batch_tokens_=tokens[:, prev_pos:cur_pos].tolist(),
            inference_mode_=True,
        )
        outputs = model.forward(input_data, past_key_values)
        for output in outputs:
            config = config_dict[output.adapter_name]
            start_idx = output.batch_start_idx_
            end_idx = output.batch_end_idx_

            # BUG FIX: logits_process gained a third `cite_flag` parameter;
            # the previous positional call shifted every sampling option by
            # one slot (temperature landed in cite_flag, top_p in
            # temperature, ...). Pass the options by keyword instead.
            next_token = logits_process(
                output.logits[:, -1],
                tokens[start_idx:end_idx, :cur_pos],
                temperature=config.temperature,
                top_p=config.top_p,
                top_k=config.top_k,
                do_sample=config.do_sample,
                repetition_penalty=config.repetition_penalty,
                renormalize_logits=config.renormalize_logits,
            )

            # Inside the prompt region keep the prompt token.
            next_token = torch.where(
                input_text_mask[start_idx:end_idx, cur_pos],
                tokens[start_idx:end_idx, cur_pos],
                next_token,
            ).to(torch.int64)
            tokens[start_idx:end_idx, cur_pos] = next_token
            # A row stops once it emits its stop token outside the prompt.
            stop_criteria = (~input_text_mask[start_idx:end_idx, cur_pos]) & (
                next_token == config.stop_token_
            )
            stop_reached[start_idx:end_idx] |= stop_criteria

        # Force-stop when the token buffer is about to run out.
        stop_reached |= total_len - cur_pos == 1

        if any(stop_reached):
            break

        if stream_callback is not None:
            stream_callback(
                cur_pos,
                _gen_outputs(
                    tokenizer,
                    config_dict,
                    current_jobs,
                    tokens,
                ),
            )

        if use_cache:
            prev_pos = cur_pos

    # BUG FIX: _dispatch_task_out no longer takes config_dict and now expects
    # attention/hidden-state arguments; pass empty/disabled placeholders.
    return _dispatch_task_out(
        tokenizer, current_jobs, tokens, stop_reached, [], [], -1, -1
    )
584
+
585
+
586
@torch.inference_mode()
def generate(
    model: LLMModel,
    tokenizer: Tokenizer,
    configs: List[GenerateConfig],
    max_gen_len: Optional[int] = None,
    use_cache: bool = True,
    dispatch_strategy: str = "fair",
    concurrent_jobs: Optional[int] = None,
    cache_implementation: Optional[str] = None,
    stream_callback: Optional[Callable] = None,
):
    """Run batched generation for every prompt of every adapter config.

    Prompts are tokenized up front, then repeatedly dispatched in batches
    (``dispatch_strategy`` "fair" or "fifo") until every job finishes.

    Returns:
        Dict mapping adapter name to its list of generated strings.
    """
    if concurrent_jobs is None:
        concurrent_jobs = len(configs)
        logging.info(f"Setting concurrent jobs to {concurrent_jobs} automatically")

    assert concurrent_jobs > 0

    # prepare for generation
    device = torch.device(model.device_)
    config_dict = {}
    for config in configs:
        config.reset_parameters()
        config_dict[config.adapter_name] = config
        if config.stop_token is not None:
            # Encode with a leading space so the stop token matches its
            # mid-sentence form; keep only the final piece.
            stop_token = tokenizer.encode(" " + config.stop_token, False)[-1]
        else:
            stop_token = tokenizer.eos_id_
        config.stop_token_ = torch.tensor(
            [stop_token], dtype=torch.int64, device=device
        )
        for idx, prompt in enumerate(config.prompts):
            args = prompt if isinstance(prompt, Tuple) else (prompt, None)
            tokens = tokenizer.encode(config.generate_prompt(*args))
            assert (
                len(tokens) < model.config_.max_seq_len_
            ), "Inputs exceeded max sequence length of model."
            config.data_.append(
                GenerateData(
                    adapter_name_=config.adapter_name,
                    prompt_index_=idx,
                    prefix_length_=len(tokens),
                    raw_tokens_=tokens,
                )
            )

    if use_cache and cache_implementation is None:
        cache_implementation = model.model_.cache_implementation()
        if cache_implementation is None:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(
                "Cache disabled by model, use cache_implementation to force enable."
            )
            use_cache = False

    packed_outputs: Dict[str, List] = {}

    # Each config's data_ queue shrinks as jobs are dispatched; the loop
    # ends once every queue is drained.
    while True:
        dispatch_args = _dispatch_task_in(configs, concurrent_jobs, dispatch_strategy)
        # dispatch_args = (current_jobs, batch_config, input_tokens,
        #                  max_tokens_len, min_tokens_len)
        if len(dispatch_args[0]) == 0:
            break

        # BUG FIX: _batch_generate takes (require_attention, require_hide)
        # after use_cache and no longer takes config_dict; the previous call
        # shifted every argument and was one parameter short (TypeError).
        outputs, running_jobs = _batch_generate(
            model,
            tokenizer,
            max_gen_len,
            use_cache,
            -1,  # require_attention disabled
            -1,  # require_hide disabled
            cache_implementation,
            stream_callback,
            *dispatch_args,
        )

        # BUG FIX: _dispatch_task_out returns a flat list of finished
        # outputs, not a dict; rebuild the adapter mapping by pairing the
        # outputs with the jobs that finished (dispatch order, minus the
        # ones still running — identity comparison on the job objects).
        finished_jobs = [job for job in dispatch_args[0] if job not in running_jobs]
        for job, output in zip(finished_jobs, outputs):
            packed_outputs.setdefault(job.adapter_name_, []).append(output)

        # Unfinished jobs go back to their queue for the next round.
        for data in running_jobs:
            config_dict[data.adapter_name_].data_.append(data)

    return packed_outputs
c2cite/model.py ADDED
@@ -0,0 +1,1039 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ import logging
4
+ import math
5
+ import os
6
+ from typing import Dict, List, Optional, Tuple
7
+ import torch.nn.functional as F
8
+
9
+
10
+ import torch
11
+ from huggingface_hub import snapshot_download
12
+ from transformers import AutoModelForCausalLM
13
+
14
+ from moe_peft.adapters import (
15
+ LoraMoeConfig,
16
+ MixLoraConfig,
17
+ MolaConfig,
18
+ lora_config_factory,
19
+ moe_layer_factory,
20
+ router_loss_factory,
21
+ )
22
+ from moe_peft.common import (
23
+ CHECKPOINT_CLASSES,
24
+ AdapterConfig,
25
+ Linear,
26
+ LLMCache,
27
+ LLMDecoder,
28
+ LLMForCausalLM,
29
+ LLMModelConfig,
30
+ LLMModelInput,
31
+ LLMModelOutput,
32
+ LLMMoeBlock,
33
+ LLMOutput,
34
+ LoraConfig,
35
+ unpack_router_logits,
36
+ )
37
+ from moe_peft.executors import executor
38
+ from moe_peft.models import from_pretrained
39
+ from moe_peft.tasks import SequenceClassificationTask, task_dict
40
+ from moe_peft.utils import is_package_available
41
+
42
+ if is_package_available("bitsandbytes"):
43
+ from transformers import BitsAndBytesConfig
44
+ else:
45
+ from moe_peft.utils import BitsAndBytesConfig
46
+
47
+
48
class CasualOutputLayer(LLMOutput):
    """Causal-LM output head: projects hidden states to vocabulary logits.

    (The historical "Casual" spelling is kept because other modules refer
    to this class by name.)
    """

    def __init__(self, vocab_size: int, weight: torch.nn.Linear):
        super().__init__()
        self.vocab_size_: int = vocab_size
        self.lm_head_: torch.nn.Module = weight

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        # float() keeps the loss numerically stable with fp16/bf16 heads.
        return self.lm_head_(data).float()

    def loss(
        self, input_ids: torch.Tensor, output_logits: torch.Tensor, labels,
        cites: Optional[List] = None, cites_v: Optional[List] = None, prompt_lens: Optional[List] = None
    ) -> torch.Tensor:
        """Next-token cross-entropy loss.

        When ``cites``/``cites_v`` are given, citation-token positions are
        masked out of the labels (ignore_index -100) so the language loss
        does not compete with the citation router. When ``prompt_lens`` is
        given, loss is averaged per sample over the completion region only;
        otherwise the standard shifted full-sequence loss is used.
        """
        if isinstance(labels, torch.Tensor):
            labels = (
                labels.clone()
                .detach()
                .to(dtype=torch.long, device=output_logits.device)
            )
        else:
            labels = torch.tensor(labels, dtype=torch.long, device=output_logits.device)

        loss_fn = torch.nn.CrossEntropyLoss()
        if cites:
            # Blank out citation positions so cross-entropy ignores them.
            for i in range(len(labels)):
                for j in range(len(cites_v[i])):
                    labels[i][cites[i][j]] = -100
            loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

        if prompt_lens is None:
            # BUG FIX: `len(prompt_lens)` / `range(len(None))` previously
            # raised TypeError when no prompt lengths were supplied (the
            # parameter's default). Fall back to the classic shifted
            # full-sequence loss in that case.
            return loss_fn(
                output_logits[..., :-1, :].contiguous().view(-1, self.vocab_size_),
                labels[..., 1:].contiguous().view(-1),
            )

        total = 0
        for i in range(len(prompt_lens)):
            # Shift by one: logits at position t predict the token at t+1,
            # starting from the last prompt position.
            total += loss_fn(
                output_logits[i, prompt_lens[i] - 1:-1, :].contiguous().view(-1, self.vocab_size_),
                labels[i, prompt_lens[i]:].contiguous().view(-1),
            )
        return total / len(prompt_lens)
90
+
91
+
92
class ClassificationOutputLayer(LLMOutput):
    """Sequence-classification head: pools the last non-pad position and scores it.

    Supports single-label (cross-entropy) and multi-label (BCE-with-logits)
    tasks. The score projection is kept in float32 regardless of model dtype.
    """

    def __init__(
        self,
        task_type: str,
        num_labels: int,
        label_dtype: torch.dtype,
        hidden_size: int,
        pad_token_id: int,
        device: str,
        weight: Optional[torch.Tensor],
    ):
        super().__init__()
        self.label_dtype_ = label_dtype
        self.num_labels_ = num_labels
        self.task_type_ = task_type
        self.pad_id_ = pad_token_id
        self.score_ = torch.nn.Linear(
            hidden_size,
            self.num_labels_,
            bias=False,
            dtype=torch.float32,
            device=device,
        )
        if weight is None:
            # Fresh head: Kaiming init with the same gain nn.Linear uses.
            torch.nn.init.kaiming_normal_(self.score_.weight, a=math.sqrt(5))
        else:
            # Restore from a checkpoint dict produced by state_dict() below.
            with torch.no_grad():
                self.score_.weight.copy_(weight["classifier"])

    def state_dict(self):
        # Only the classifier projection is adapter-specific.
        return {"classifier": self.score_.weight}

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.score_(data.to(torch.float32))

    def loss(
        self, input_ids: torch.Tensor, output_logits: torch.Tensor, labels
    ) -> torch.Tensor:
        """Classification loss over the last non-pad token of each sequence."""
        if isinstance(labels, torch.Tensor):
            labels = (
                labels.clone()
                .detach()
                .to(dtype=self.label_dtype_, device=output_logits.device)
            )
        else:
            labels = torch.tensor(
                labels, dtype=self.label_dtype_, device=output_logits.device
            )
        batch_size = input_ids.shape[0]
        # Index of the token right before the first pad in each row.
        # NOTE(review): argmax returns 0 when a row contains no padding,
        # which makes this -1 (the last position) — confirm that is the
        # intended fallback.
        sequence_lengths = (torch.eq(input_ids, self.pad_id_).int().argmax(-1) - 1).to(
            output_logits.device
        )
        pooled_logits = output_logits[
            torch.arange(batch_size, device=output_logits.device), sequence_lengths
        ]
        if self.task_type_ == "single_label_classification":
            loss_fn = torch.nn.CrossEntropyLoss()
            return loss_fn(pooled_logits.view(-1, self.num_labels_), labels.view(-1))
        elif self.task_type_ == "multi_label_classification":
            loss_fn = torch.nn.BCEWithLogitsLoss()
            return loss_fn(pooled_logits, labels)
        else:
            raise ValueError(f"unknown task type {self.task_type_}")
155
+
156
+
157
class OutputLayer(torch.nn.Module):
    """Dispatches hidden states to each adapter's own output head.

    ``layers_`` maps adapter name -> head module; ``forward`` slices the
    batch by each adapter's row range and emits one ``LLMModelOutput`` per
    adapter, carrying the head's logits and its loss function.
    """

    def __init__(self):
        super().__init__()
        self.layers_: Dict[str, torch.nn.Module] = {}

    def forward(
        self, data: torch.Tensor, input_args: LLMModelInput
    ) -> List[LLMModelOutput]:
        results = []
        for batch_cfg in input_args.batch_configs_:
            name = batch_cfg.adapter_name_
            assert name != "" and name in self.layers_
            head = self.layers_[name]
            rows = data[batch_cfg.batch_start_idx_ : batch_cfg.batch_end_idx_]
            results.append(
                LLMModelOutput(
                    adapter_name=name,
                    logits=head.forward(rows),
                    loss_fn_=head.loss,
                )
            )

        return results
182
+
183
+
184
def init_lora_layer_weight(
    transformer_layer: LLMDecoder,
    llm_config: LLMModelConfig,
    lora_config: LoraConfig,
    lora_weights: Optional[Dict[str, torch.Tensor]],
):
    """Attach (and optionally load) LoRA/MoE adapter weights on one decoder layer.

    Depending on the adapter type this either fuses a MoE block into the MLP
    (MixLoRA), plugs per-linear gates (LoRAMoE on MLP projections, MoLA on
    all projections), or installs plain LoRA pairs. When ``lora_weights`` is
    None all weights are freshly initialized; otherwise they are looked up
    by their serialized module names.
    """
    target_modules = lora_config.target_modules_
    attn_state_dict, mlp_state_dict = transformer_layer.state_dict()
    attn_state_dict: Dict[str, torch.Tensor]
    mlp_state_dict: Dict[str, torch.Tensor]
    all_state_dict: Dict[str, torch.Tensor] = copy.copy(attn_state_dict)
    all_state_dict.update(mlp_state_dict)
    moe_init_strategy = "none"
    if isinstance(lora_config, MixLoraConfig):
        model_prefix_name = "mixlora"
        moe_layer_name_list = list(mlp_state_dict.keys())
        moe_init_strategy = "fused_mlp"
    elif isinstance(lora_config, LoraMoeConfig):
        model_prefix_name = "loramoe"
        moe_layer_name_list = list(mlp_state_dict.keys())
        moe_init_strategy = "plugin"
    elif isinstance(lora_config, MolaConfig):
        model_prefix_name = "mola"
        moe_layer_name_list = list(all_state_dict.keys())
        moe_init_strategy = "plugin"
    else:
        model_prefix_name = "base_model.model.model"
        moe_layer_name_list = []

    assert len(moe_layer_name_list) == 0 or moe_init_strategy in ["plugin", "fused_mlp"]

    if moe_init_strategy == "fused_mlp":
        # MixLoRA: one shared gate living on the MLP block itself.
        transformer_layer.mlp_.moes_[lora_config.adapter_name] = moe_layer_factory(
            llm_config.dim_,
            llm_config.device_,
            lora_config,
            (
                None
                if lora_weights is None
                else lora_weights[
                    f"{model_prefix_name}.layers.{transformer_layer.layer_id_}.mlp.moe_gate.weight"
                ]
            ),
        )

    for proj_name, lora_linear in all_state_dict.items():
        lora_linear: Linear
        if proj_name not in target_modules or not target_modules[proj_name]:
            continue
        module_name = (
            "self_attn"
            if proj_name in attn_state_dict
            else ("mlp" if proj_name in mlp_state_dict else None)
        )
        module_name = f"{model_prefix_name}.layers.{transformer_layer.layer_id_}.{module_name}.{proj_name}"
        if proj_name in moe_layer_name_list:
            if moe_init_strategy == "plugin":
                # init for gating mechanisms
                lora_linear.moes_[lora_config.adapter_name] = moe_layer_factory(
                    lora_linear.in_features_,
                    llm_config.device_,
                    lora_config,
                    (
                        lora_weights.get(f"{module_name}.moe_gate.weight", None)
                        if lora_weights is not None
                        else None
                    ),
                )

            # One LoRA A/B pair per expert on this projection.
            for expert_idx in range(lora_config.num_experts_):
                if lora_weights is None:
                    lora_a = None
                    lora_b = None
                else:
                    lora_a = lora_weights.get(
                        f"{module_name}.experts.{expert_idx}.lora_A.weight", None
                    )
                    lora_b = lora_weights.get(
                        f"{module_name}.experts.{expert_idx}.lora_B.weight", None
                    )

                lora_linear.init_lora_weight(
                    lora_config.expert_config(expert_idx), (lora_a, lora_b)
                )
        else:
            # Plain (non-MoE) LoRA on this projection.
            if lora_weights is None:
                lora_a = None
                lora_b = None
            else:
                lora_a = lora_weights.get(f"{module_name}.lora_A.weight", None)
                lora_b = lora_weights.get(f"{module_name}.lora_B.weight", None)

            lora_linear.init_lora_weight(lora_config, (lora_a, lora_b))
277
+
278
+
279
def get_lora_layer_weight(
    transformer_layer: LLMDecoder,
    lora_config: LoraConfig,
    lora_weights: Dict[str, torch.Tensor],
):
    """Collect this layer's adapter weights into ``lora_weights`` (in place).

    Inverse of ``init_lora_layer_weight``: walks the same serialized module
    names and records gate weights and per-expert / plain LoRA A/B weights.
    """
    target_modules = lora_config.target_modules_
    attn_state_dict, mlp_state_dict = transformer_layer.state_dict()
    attn_state_dict: Dict[str, torch.Tensor]
    mlp_state_dict: Dict[str, torch.Tensor]
    all_state_dict: Dict[str, torch.Tensor] = copy.copy(attn_state_dict)
    all_state_dict.update(mlp_state_dict)
    if isinstance(lora_config, MixLoraConfig):
        model_prefix_name = "mixlora"
        gate_layer_name = (
            f"mixlora.layers.{transformer_layer.layer_id_}.mlp.moe_gate.weight"
        )
        moe_layer_name_list = list(mlp_state_dict.keys())
    elif isinstance(lora_config, LoraMoeConfig):
        model_prefix_name = "loramoe"
        moe_layer_name_list = list(mlp_state_dict.keys())
    elif isinstance(lora_config, MolaConfig):
        model_prefix_name = "mola"
        moe_layer_name_list = list(all_state_dict.keys())
    else:
        model_prefix_name = "base_model.model.model"
        moe_layer_name_list = []

    # for fused MoEs such as MixLoRA
    mlp_moe_layer: LLMMoeBlock = transformer_layer.mlp_.moes_.get(
        lora_config.adapter_name, None
    )
    if mlp_moe_layer is not None:
        # NOTE(review): gate_layer_name is only bound in the MixLoraConfig
        # branch above; if a non-MixLoRA adapter ever has a fused MLP MoE,
        # this raises NameError — confirm that cannot happen.
        lora_weights[gate_layer_name] = mlp_moe_layer.gate_.weight

    for proj_name, lora_linear in all_state_dict.items():
        lora_linear: Linear
        if proj_name not in target_modules or not target_modules[proj_name]:
            continue
        module_name = (
            "self_attn"
            if proj_name in attn_state_dict
            else ("mlp" if proj_name in mlp_state_dict else None)
        )
        module_name = f"{model_prefix_name}.layers.{transformer_layer.layer_id_}.{module_name}.{proj_name}"
        if proj_name in moe_layer_name_list:
            # For fused MoEs the experts live on mlp_moe_layer; for plugged
            # MoEs they live on the linear itself.
            moe_layer = (
                lora_linear.moes_[lora_config.adapter_name]
                if lora_config.adapter_name in lora_linear.moes_
                else mlp_moe_layer
            )
            # for plugged MoEs such as LoRAMoE, MoLA, etc.
            if lora_config.adapter_name in lora_linear.moes_:
                lora_weights[f"{module_name}.moe_gate.weight"] = lora_linear.moes_[
                    lora_config.adapter_name
                ].gate_.weight

            for expert_idx in range(moe_layer.experts_):
                moe_lora_name = f"moe.{lora_config.adapter_name}.experts.{expert_idx}"
                lora_obj = lora_linear.loras_.get(moe_lora_name, None)
                if lora_obj is not None:
                    lora_weights[
                        f"{module_name}.experts.{expert_idx}.lora_A.weight"
                    ] = lora_obj.lora_a_.weight
                    lora_weights[
                        f"{module_name}.experts.{expert_idx}.lora_B.weight"
                    ] = lora_obj.lora_b_.weight

        else:
            lora_obj = lora_linear.loras_.get(lora_config.adapter_name, None)
            if lora_obj is not None:
                lora_weights[f"{module_name}.lora_A.weight"] = lora_obj.lora_a_.weight
                lora_weights[f"{module_name}.lora_B.weight"] = lora_obj.lora_b_.weight
352
+
353
def get_atten_tar(x, y, device, dtype):
    """Build attention-target weighting tensors.

    Returns:
        alpha: shape (y,), ramping from 0 toward 1 as ``1 - exp(-i/200)``.
        beta: shape (x-1, x-1); column j holds ``exp(-2*k) * award(k)`` for
            k = 1..x-1, divided by the running cumulative sum of
            ``exp(-2*m)`` up to row j.
    """
    positions = torch.arange(0, y, device=device, dtype=dtype)
    offsets = torch.arange(1, x, device=device, dtype=dtype)  # 1 .. x-1
    decay = torch.tensor(-2, device=device, dtype=dtype)

    alpha = (1 - torch.exp(-(positions / 200))).detach()

    # Running cumulative sum of exp(-2), exp(-4), ..., exp(-2*(x-1)).
    cum_base = torch.empty(x - 1, device=device, dtype=dtype)
    cum_base[0] = torch.exp(decay)
    for step in range(1, x - 1):
        cum_base[step] = cum_base[step - 1] + torch.exp(decay * (step + 1))

    # Small positive reward that grows with the offset, detached from autograd.
    reward = (0.1 * (0.5 - 1 / (offsets + 1)) + 0.2).detach()
    weights = (torch.exp(decay * offsets) * reward).expand(offsets.shape[0], x - 1).T
    beta = (weights / cum_base).detach()

    return alpha, beta
372
+
373
+
374
class LLMModel(torch.nn.Module):
    """Wrapper around a causal LM that adds citation-aware heads.

    On top of the base ``LLMForCausalLM`` this module owns:
      * ``routerup`` — a 2-way router deciding "cite here" vs "normal token";
      * ``cite_output`` / ``doc_proj`` — projections used to score citation
        logits against per-document embeddings;
      * ``alpha`` / ``beta`` — fixed schedules (from ``get_atten_tar``) used
        by the attention-supervision loss;
    plus the usual adapter (LoRA/MoE) management API.

    State note: ``self.doc_embeds`` is populated inside ``_prepare_inputs``
    during the prefill pass (sequence length > 1) and is read again during
    single-token decode steps and in ``forward`` — calls are order-dependent.
    """

    def __init__(self, model: LLMForCausalLM):
        super().__init__()
        args: LLMModelConfig = model.config_
        if args.vocab_size_ >= torch.finfo(args.dtype_).max:
            # NOTE(review): logging.warn is deprecated; logging.warning is the
            # non-deprecated spelling — left as-is (doc-only change).
            logging.warn(
                f"vocab_size >= max({args.dtype_}), consider load model with higher precision."
            )
        self.model_ = model
        self.config_ = args
        # configs
        self.name_or_path_ = args.name_or_path_
        self.vocab_size_ = args.vocab_size_
        self.device_ = args.device_
        self.dtype_ = args.dtype_

        # Per-head mixing weight for attention matrices (n_heads, 1).
        self.attention_weight = torch.nn.Parameter(torch.empty(
            model.layers_[0].self_attn_.n_heads_,1,dtype=args.dtype_,device=args.device_,))

        # 2-way router head: column 1 > column 0 at the last position
        # triggers citation mode at inference (see forward()).
        self.routerup = torch.nn.Parameter(torch.empty(
            model.config_.dim_, 2,dtype=args.dtype_,device=args.device_,))
        """self. routerdown = torch.nn.Parameter(torch.empty(
            model.config_.dim_ * 2, 2,dtype=args.dtype_,device=args.device_,))"""
        # Projection producing the citation-scoring space.
        self.cite_output = torch.nn.Parameter(torch.empty(
            model.config_.dim_,model.config_.dim_,dtype=args.dtype_,device=args.device_,))
        # Projection applied to document embeddings (orthogonally initialized).
        self.doc_proj = torch.nn.Parameter(torch.empty(
            model.config_.dim_, model.config_.dim_,dtype=args.dtype_,device=args.device_,))

        # Fixed loss schedules; hard-coded for up to 40 citations / 3000 tokens.
        self.alpha, self.beta= get_atten_tar(40, 3000, args.device_, args.dtype_)
        self.silu = torch.nn.SiLU()

        self.output_ = OutputLayer()
        # adapter configs
        self.adapter_configs_: Dict[str, LoraConfig] = {}

    def token2id(self, t):
        """Map a special citation token id to a compact document index.

        Reserved ids 128002/128003/128004/128005/128008 map to 0..4 and the
        range 128010..128255 maps to 5..250; anything else returns -1
        ("not a citation token").
        """
        if isinstance(t, torch.Tensor):
            x = t.item()
        else:
            x = t
        if x == 128002:
            return 0
        elif x == 128003:
            return 1
        elif x == 128004:
            return 2
        elif x == 128005:
            return 3
        elif x == 128008:
            return 4
        elif x >= 128010 and x <= 128255:
            return x - 128005
        else:
            return -1

    def attention_target(self, i, j, T):
        """Target attention mass for citation i at position j (step T).

        NOTE(review): ``self.award`` is never assigned anywhere in this class
        (only ``alpha``/``beta`` are stored by __init__); calling this method
        would raise AttributeError — looks like stale code, confirm before use.
        """
        return self.alpha[j] * self.beta[T, i] * self.award[i]

    def _prepare_inputs(
        self, input_args: LLMModelInput, past_key_values: Optional[LLMCache] = None
    ):
        """Turn a batch request into (ids, embeddings, masks, cache positions).

        Side effect: on a prefill pass (seq len > 1) this (re)builds
        ``self.doc_embeds`` — one mean-pooled embedding per document per batch
        element — and overwrites the embedding at each citation position with
        the referenced document embedding. On decode steps (seq len == 1) it
        reuses the previously built ``self.doc_embeds``.
        """
        assert input_args.batch_tokens_ is not None, "Model have no input."
        assert (
            input_args.gradient_checkpoint_ == "none" or past_key_values is None
        ), "Cache is incompatible with gradient checkpointing."
        assert (
            not input_args.inference_mode_ or input_args.gradient_checkpoint_ == "none"
        ), "Can not use gradient checkpoint when inference."

        # prepare inputs
        if isinstance(input_args.batch_tokens_, torch.Tensor):
            # NOTE(review): Tensor.to() does not accept requires_grad= — this
            # branch would raise TypeError if batch_tokens_ is ever a Tensor;
            # presumably callers always pass lists. Confirm.
            input_ids = input_args.batch_tokens_.to(
                dtype=torch.int64, device=self.device_, requires_grad=False
            )
        else:
            input_ids = torch.tensor(
                input_args.batch_tokens_, dtype=torch.int64, device=self.device_, requires_grad=False
            )

        inputs_embeds = self.model_.embed_tokens(input_ids)

        """if input_ids.shape[-1] > 1:
            self.doc_embeds = []
            cites = input_args.batch_cites
            docs = input_args.batch_docs
            for doc in docs:
                doc = doc.clone().to(self.device_)
                doc = doc @ self.doc_proj
                self.doc_embeds.append(doc)
            for i, cite in enumerate(cites):
                for c in range(len(input_args.batch_cites_value[i])):
                    inputs_embeds[i, cite[c]] = self.doc_embeds[i][self.token2id(input_args.batch_cites_value[i][c]) - 1].to(self.device_)
        else:
            fk = self.token2id(input_ids[0,0])
            if fk != -1:
                inputs_embeds[0][0] = self.doc_embeds[0][fk - 1].to(self.device_)"""

        docs = input_args.batch_docs
        if input_ids.shape[-1] > 1:
            # Prefill: build one mean-pooled embedding per document
            # (docs[i][j][1:] skips the leading token, presumably BOS — confirm).
            self.doc_embeds = []
            cites = input_args.batch_cites
            if not isinstance(docs[0][0], torch.Tensor):
                for i in range(len(docs)):
                    d = []
                    for j in range(len(docs[i])):
                        temp = self.model_.embed_tokens(torch.tensor(
                            docs[i][j][1:], dtype=torch.int64, device=self.device_, requires_grad=False))
                        temp = torch.mean(temp, dim = 0)
                        d.append(temp)
                    d = torch.stack(d)
                    self.doc_embeds.append(d)
            # Replace each citation-token embedding with its document embedding.
            for i, cite in enumerate(cites):
                for c in range(len(input_args.batch_cites_value[i])):
                    doc_ind = self.token2id(input_args.batch_cites_value[i][c]) - 1
                    # NOTE(review): `assert cond, print(...)` evaluates print
                    # eagerly and uses None as the message — likely unintended.
                    assert doc_ind >= 0, print("fake cite token")
                    inputs_embeds[i, cite[c]] = self.doc_embeds[i][doc_ind].to(self.device_)
        else:
            # Decode step: if the single new token is a citation token, swap in
            # the cached document embedding (batch index 0 only).
            fk = self.token2id(input_ids[0,0]) - 1
            if fk >= 0:
                inputs_embeds[0][0] = self.doc_embeds[0][fk].to(self.device_)

        if input_args.gradient_checkpoint_ != "none":
            inputs_embeds.requires_grad_(True)

        # prepare cache
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )

        if past_seen_tokens is None:
            past_seen_tokens = 0

        cache_position = torch.arange(
            past_seen_tokens,
            past_seen_tokens + inputs_embeds.shape[1],
            device=inputs_embeds.device,
        )

        # prepare mask
        if input_args.batch_masks_ is not None:
            # 2d mask is passed through the layers
            if isinstance(input_args.batch_masks_, torch.Tensor):
                attention_mask = input_args.batch_masks_.to(
                    dtype=torch.int64, device=self.device_
                )
            else:
                attention_mask = torch.tensor(
                    input_args.batch_masks_, dtype=torch.int64, device=self.device_
                )
        else:
            attention_mask = None

        if self.config_.attn_implementation_ != "flash_attn":
            causal_mask = self.model_.causal_mask(
                attention_mask, inputs_embeds, cache_position, past_key_values
            )
        else:
            causal_mask = attention_mask

        return input_ids, inputs_embeds, attention_mask, causal_mask, cache_position

    def _call_decoder_stack_original(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Original decoder loop: returns hidden states and per-adapter MoE
        router logits (kept for reference; forward() uses _call_decoder_stack).
        """
        # decoder layers
        num_adapters = len(input_args.batch_configs_)
        all_router_logits = [[] for _ in range(num_adapters)]
        gradient_checkpoint = CHECKPOINT_CLASSES[input_args.gradient_checkpoint_]

        for decoder_layer in self.model_.decoder_stack():
            hidden_states, *router_logits = gradient_checkpoint(
                decoder_layer.forward,
                hidden_states,
                input_args,
                rotary_emb,
                attention_mask,
                cache_position,
                past_key_value,
            )
            if len(router_logits) == 0:
                continue
            # collecting router logits
            assert len(router_logits) == num_adapters
            for idx in range(num_adapters):
                if router_logits[idx] is not None:
                    all_router_logits[idx].append(router_logits[idx])

        hidden_states = self.model_.norm(hidden_states)

        return hidden_states, all_router_logits


    def _call_decoder_stack(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
        #require_attention: Optional[int] = -1,
        #require_hide: Optional[int] = -1,
    ):
        """Run the decoder stack, additionally collecting the attention
        matrices of the last three layers (hard-coded indices 29/30/31 —
        assumes a 32-layer model; TODO confirm for other model sizes).
        """
        # decoder layers
        gradient_checkpoint = CHECKPOINT_CLASSES[input_args.gradient_checkpoint_]

        #hidden_output = []
        #hidden_atten = []
        attention_matrixs = []
        for idx, decoder_layer in enumerate(self.model_.decoder_stack()):
            hidden_states, attention_matrix = gradient_checkpoint(
                decoder_layer.forward,
                hidden_states,
                input_args,
                rotary_emb,
                attention_mask,
                cache_position,
                past_key_value,
            )
            if idx in [31,30,29]:
                attention_matrixs.append(attention_matrix)
            """if require_hide == len(self.model_.layers_) or require_hide == idx:
                hidden_output.append(hidden_states)
            if require_attention == len(self.model_.layers_) or require_attention == idx:
                hidden_atten.append(hidden_attention)"""

        hidden_states = self.model_.norm(hidden_states)

        return hidden_states, attention_matrixs#hidden_atten, hidden_output

    # compute the model: output probs
    def forward(
        self, input_args: LLMModelInput, past_key_values: Optional[LLMCache] = None
    ) -> List[LLMModelOutput]:
        """Full forward pass.

        Training (labels present): returns per-adapter outputs with ``loss``
        plus an ``aux_loss`` combining attention supervision, router loss and
        citation loss (each scaled by its configured coefficient).
        Inference (labels None): if the router prefers "cite" at the last
        position, ``logits`` are replaced by document-similarity logits and
        ``cite_flag`` is set.
        """
        input_ids, inputs_embeds, attention_mask, causal_mask, cache_position = (
            self._prepare_inputs(input_args, past_key_values)
        )

        labels = input_args.batch_labels_

        # Clear consumed fields so downstream code cannot reuse them.
        input_args.batch_labels_ = None
        input_args.batch_tokens_ = None
        input_args.batch_masks_ = None

        # embed positions
        hidden_states = inputs_embeds

        rotary_emb = self.model_.rotary_embed(
            hidden_states, cache_position.unsqueeze(0)
        )

        hidden_states, attention_matrixs = self._call_decoder_stack(
            hidden_states,
            input_args,
            rotary_emb,
            causal_mask,
            cache_position,
            past_key_values,
            #require_attention,
            #require_hide,
        )
        # Collapse heads of the last collected attention matrix for inspection.
        attention_matrixs[-1] = attention_matrixs[-1].permute(0,2,3,1)
        attention_matrixs[-1] = torch.sum(attention_matrixs[-1], dim = -1).squeeze().to('cpu').detach()
        #print(attention_matrixs[-1].shape)
        #print(torch.mean(attention_matrixs[-1][input_args.batch_cites[0][0] + 1:input_args.batch_cites[0][2],input_args.batch_cites[0][0]]))
        # NOTE(review): debug visualization block below — imports matplotlib/
        # seaborn inside forward, writes to a hard-coded path ("/yy21/heatmap")
        # and calls input(), which BLOCKS execution. Must be removed before any
        # non-interactive use. Kept verbatim (doc-only change).
        import numpy as np
        import matplotlib.pyplot as plt
        import seaborn as sns
        plt.figure(figsize=(8, 6))
        print(f"len:{input_args.batch_prompt_len[0]}")
        print(attention_matrixs[-1].shape)
        sns.heatmap(attention_matrixs[-1][input_args.batch_prompt_len[0]:,input_args.batch_prompt_len[0]:], annot=False, cmap="YlGnBu", vmin = 0, vmax = 0.2, xticklabels=False, yticklabels=False)
        plt.savefig("/yy21/heatmap", bbox_inches='tight', dpi=300)
        input()
        #route_logits = hidden_states @ (self.routerup @ self.routerdown)
        route_logits = hidden_states @ self.routerup
        # Citation logits: cosine similarity between projected hidden states
        # and (detached) document embeddings.
        hidden_cites = hidden_states @ self.cite_output
        norm_cite_logits = F.normalize(hidden_cites, p = 2, dim = 2)
        cite_logits = []
        for batch in range(hidden_states.shape[0]):
            #norm_doc = F.normalize(self.doc_embeds[batch], p = 2, dim = 1)
            norm_doc = F.normalize(self.doc_embeds[batch].detach(), p = 2, dim = 1)
            cite_logits.append(norm_cite_logits[batch] @ norm_doc.T)
            #cite_logits.append(norm_cite_logits[batch])


        # calculate loss
        output = self.output_(hidden_states, input_args)
        #att_s = hidden_atten[0].sum(dim = 1).squeeze() / 32  # (collapses the list to one value)
        assert isinstance(output, List)
        for indx, lora_config in enumerate(input_args.batch_configs_):
            output_data = output[indx]
            assert isinstance(output_data, LLMModelOutput)
            start_idx = lora_config.batch_start_idx_
            end_idx = lora_config.batch_end_idx_
            output_data.batch_start_idx_ = start_idx
            output_data.batch_end_idx_ = end_idx
            #print(f"router:{route_logits[0,-1]}")
            #print(f"cite:{cite_logits}")
            # Inference-time routing: only batch element 0 is consulted.
            if (labels is None) and (route_logits[0, -1, 1] > route_logits[0, -1, 0]):
                output_data.logits = cite_logits[0].unsqueeze(0)
                #output_data.logits = hidden_states[0].unsqueeze(0)
                output_data.cite_flag = True
            else:
                output_data.cite_flag = False
            if labels is None:
                continue
            # compute loss when labels provided
            output_data.loss = output_data.loss_fn_(
                input_ids[start_idx:end_idx],
                output_data.logits,
                labels[start_idx:end_idx],
                input_args.batch_cites,
                input_args.batch_cites_value,
                input_args.batch_prompt_len
            )
            output_data.loss_fn_ = None
            # TODO (original note, translated): merge route_logits handling
            # with the filtering below.
            # Drop citations that fall inside the prompt — aux losses only
            # supervise generated positions. NOTE: mutates input_args in place.
            for idx in range(len(input_args.batch_cites)):
                new_cites = []
                new_cites_v = []
                for i in range(len(input_args.batch_cites[idx])):
                    if input_args.batch_cites[idx][i] >= input_args.batch_prompt_len[idx]:
                        new_cites.append(input_args.batch_cites[idx][i])
                        if i < len(input_args.batch_cites_value[idx]):
                            new_cites_v.append(input_args.batch_cites_value[idx][i])
                input_args.batch_cites[idx] = new_cites
                input_args.batch_cites_value[idx] = new_cites_v
            if output_data.aux_loss is None:
                output_data.aux_loss = self.attn_mat_coin * 0.01 * self.attention_loss_fn(attention_matrixs, causal_mask, input_args.batch_cites, input_args.batch_prompt_len)
            else:
                output_data.aux_loss += self.attn_mat_coin * 0.01 * self.attention_loss_fn(attention_matrixs, causal_mask, input_args.batch_cites, input_args.batch_prompt_len)
            print(f"1:{output_data.aux_loss}")
            # Align cites/values lengths before the router/citation losses.
            for idx in range(len(input_args.batch_cites)):
                if len(input_args.batch_cites[idx]) > len(input_args.batch_cites_value[idx]):
                    input_args.batch_cites[idx] = input_args.batch_cites[idx][:-1]
            # Router labels: 1 at citation positions, 0 elsewhere.
            output_data.aux_loss += self.router_coin * 10 * self.compute_route_loss(route_logits, input_args.batch_cites)
            print(f"2:{output_data.aux_loss}")
            #output_data.aux_loss += self.cite_coin * self.compute_cite_loss2(hidden_states, input_args.batch_cites,input_args.batch_cites_value,batch_doc_embed)
            output_data.aux_loss += self.cite_coin * 100 * self.compute_cite_loss(cite_logits, input_args.batch_cites,input_args.batch_cites_value)
            print(f"3:{output_data.aux_loss}")
        return output

    # NOTE(review): defined without `self` — this is a static factory in
    # practice (LLMModel.from_pretrained(path, ...)); presumably a missing
    # @staticmethod decorator. The inner `from_pretrained(...)` call resolves
    # to the module-level helper imported from .models, not to this method.
    def from_pretrained(
        name_or_path: str,
        device: str,
        bits: int = None,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        load_dtype: torch.dtype = torch.bfloat16,
        compute_dtype: torch.dtype = torch.bfloat16,
        double_quant: bool = True,
        quant_type: str = "nf4",
    ) -> "LLMModel":
        """Load a HF causal LM (optionally 4/8-bit quantized) and wrap it."""
        # load_dtype will change the precision of LLaMA pre-trained model
        # when loading with quantization (bits = 8 or bits = 4), load_dtype will only influence the actual computing precision
        if load_dtype not in [torch.bfloat16, torch.float16, torch.float32]:
            raise ValueError(f"unsupported load dtype {load_dtype}")

        if compute_dtype not in [torch.bfloat16, torch.float16, torch.float32]:
            raise ValueError(f"unsupported compute dtype {compute_dtype}")

        if load_dtype in [torch.bfloat16, torch.float16]:
            logging.info("Loading model with half precision.")

        # BFloat16 is only supported after Ampere GPUs
        if not executor.is_bf16_supported():
            if load_dtype == torch.bfloat16:
                logging.warning("bf16 is not available. deprecated to fp16.")
                load_dtype = torch.float16

            if bits in [4, 8] and compute_dtype == torch.bfloat16:
                logging.warning("bf16 is not available. deprecated to fp16.")
                compute_dtype = torch.float16

        if bits in [4, 8]:
            logging.info(f"Loading model with quantization, bits = {bits}.")
            llm_model = AutoModelForCausalLM.from_pretrained(
                name_or_path,
                device_map=device,
                trust_remote_code=True,
                quantization_config=BitsAndBytesConfig(
                    load_in_4bit=bits == 4,
                    load_in_8bit=bits == 8,
                    llm_int8_threshold=6.0,
                    llm_int8_has_fp16_weight=False,
                    bnb_4bit_compute_dtype=compute_dtype,
                    bnb_4bit_use_double_quant=double_quant,
                    bnb_4bit_quant_type=quant_type,
                ),
                torch_dtype=load_dtype,
            )
        else:
            llm_model = AutoModelForCausalLM.from_pretrained(
                name_or_path,
                device_map=device,
                trust_remote_code=True,
                torch_dtype=load_dtype,
            )

        # Freeze the base model; only adapter/head parameters train.
        llm_model.requires_grad_(False)

        model = from_pretrained(
            llm_model,
            attn_impl=attn_impl,
            use_sliding_window=use_sliding_window,
            device=device,
        )

        logging.info(f"Use {attn_impl} as attention implementation.")

        return LLMModel(model)

    def init_adapter(
        self, config: AdapterConfig, weight: Optional[Dict[str, torch.Tensor]] = None
    ):
        """Register an adapter: set loss coefficients, build the output layer,
        initialize (or load) the citation heads, and init LoRA layers.

        When ``weight`` is None the heads are freshly initialized; otherwise
        they are copied from the checkpoint dict keyed by adapter name.
        """
        self.attn_mat_coin = config.atten_coin
        self.router_coin = config.router_coin
        self.cite_coin = config.cite_coin
        # Patch for MixLoRA
        if isinstance(config, MixLoraConfig) and config.act_fn_ is None:
            config.act_fn_ = self.config_.hidden_act_

        self.adapter_configs_[config.adapter_name] = config
        # init output layer
        if config.task_name in task_dict and isinstance(
            task_dict[config.task_name], SequenceClassificationTask
        ):
            output_layer = ClassificationOutputLayer(
                **task_dict[config.task_name].init_kwargs(),
                hidden_size=self.config_.dim_,
                pad_token_id=self.config_.pad_token_id_,
                device=self.device_,
                weight=weight,
            )
        else:
            output_layer = CasualOutputLayer(
                vocab_size=self.config_.vocab_size_, weight=self.model_.lm_head_
            )

        if weight is None:
            torch.nn.init.kaiming_normal_(self.attention_weight, mode='fan_in', nonlinearity='relu')
            torch.nn.init.kaiming_normal_(self.routerup, mode='fan_in', nonlinearity='relu')
            #torch.nn.init.kaiming_normal_(self.routerdown, mode='fan_in', nonlinearity='relu')
            torch.nn.init.kaiming_normal_(self.cite_output, mode='fan_in', nonlinearity='relu')
            torch.nn.init.orthogonal_(self.doc_proj)
        else:
            # NOTE(review): weight.get(..., None) would make copy_ fail with a
            # TypeError if a key is missing — effectively a hard requirement
            # that all four head tensors exist in the checkpoint.
            with torch.no_grad():
                self.attention_weight.copy_(weight.get(f"{config.adapter_name}.attention_mat_weight", None))
                self.routerup.copy_(weight.get(f"{config.adapter_name}.router_weight_up", None))
                #self.routerdown.copy_(weight.get(f"{config.adapter_name}.router_weight_down", None))
                self.cite_output.copy_(weight.get(f"{config.adapter_name}.cite_weight", None))
                self.doc_proj.copy_(weight.get(f"{config.adapter_name}.doc_weight", None))
        self.output_.layers_[config.adapter_name] = output_layer
        if type(config) is not AdapterConfig:
            # init transformer layers
            for transformer_layer in self.model_.layers_:
                init_lora_layer_weight(transformer_layer, self.config_, config, weight)
        else:
            assert weight is None, "can not load basic adapter with weight"

        return config.adapter_name

    def get_adapter_weight_dict(self, adapter_name: str) -> Dict[str, torch.Tensor]:
        """Collect every trainable tensor belonging to *adapter_name*:
        output layer, the four citation heads, and all per-layer LoRA weights.
        """
        # return the lora weight and target_module's name
        lora_weight_dict = self.output_.layers_[adapter_name].state_dict()
        atten_name = f"{adapter_name}.attention_mat_weight"
        lora_weight_dict[atten_name] = self.attention_weight
        route_name = f"{adapter_name}.router_weight_up"
        lora_weight_dict[route_name] = self.routerup
        """route_name = f"{adapter_name}.router_weight_down"
        lora_weight_dict[route_name] = self.routerdown"""
        cite_name = f"{adapter_name}.cite_weight"
        lora_weight_dict[cite_name] = self.cite_output
        doc_name = f"{adapter_name}.doc_weight"
        lora_weight_dict[doc_name] = self.doc_proj
        lora_config = self.adapter_configs_[adapter_name]
        for transformer_layer in self.model_.layers_:
            get_lora_layer_weight(transformer_layer, lora_config, lora_weight_dict)

        return lora_weight_dict

    def unload_adapter(
        self, adapter_name: str
    ) -> Tuple[LoraConfig, Dict[str, torch.Tensor]]:
        """Remove an adapter from the model and return its (config, weights).

        Walks every layer, popping plain LoRA weights, fused MLP MoEs, and
        plugged per-linear MoEs (with their per-expert LoRA weights).
        """
        assert adapter_name in self.adapter_configs_, "adapter not exist"
        lora_weight = self.get_adapter_weight_dict(adapter_name)
        lora_config = self.adapter_configs_.pop(adapter_name)
        self.output_.layers_.pop(adapter_name)
        for transformer_layer in self.model_.layers_:
            attn_state_dict, mlp_state_dict = transformer_layer.state_dict()
            attn_state_dict: Dict[str, torch.Tensor]
            mlp_state_dict: Dict[str, torch.Tensor]
            lora_layer_list = list(attn_state_dict.values())
            lora_layer_list.extend(mlp_state_dict.values())

            for lora_layer in lora_layer_list:
                if adapter_name in lora_layer.loras_:
                    lora_layer.loras_.pop(adapter_name, None)
                elif adapter_name in transformer_layer.mlp_.moes_:
                    # Fused MoE (e.g. MixLoRA): drop each expert's LoRA pair.
                    for expert_idx in range(
                        transformer_layer.mlp_.moes_[adapter_name].experts_
                    ):
                        moe_lora_name = f"moe.{adapter_name}.experts.{expert_idx}"
                        lora_layer.loras_.pop(moe_lora_name, None)

                    transformer_layer.mlp_.moes_.pop(adapter_name)
                elif adapter_name in lora_layer.moes_:
                    # Plugged MoE (e.g. LoRAMoE / MoLA) on this linear.
                    for expert_idx in range(lora_layer.moes_[adapter_name].experts_):
                        moe_lora_name = f"moe.{adapter_name}.experts.{expert_idx}"
                        lora_layer.loras_.pop(moe_lora_name, None)

                    lora_layer.moes_.pop(lora_config.adapter_name, None)

        return lora_config, lora_weight

    def load_adapter(self, name_or_path: str, adapter_name: Optional[str] = None):
        """Load an adapter from a local path or the HF Hub and register it.

        Returns the adapter name (defaults to *name_or_path*).
        """
        if adapter_name is None:
            adapter_name = name_or_path

        if not os.path.exists(name_or_path):
            name_or_path = snapshot_download(repo_id=name_or_path, repo_type="model")
        with open(
            name_or_path + os.sep + "adapter_config.json", "r", encoding="utf8"
        ) as fp:
            lora_config = lora_config_factory(json.load(fp))
        lora_config.adapter_name = adapter_name
        lora_weight = torch.load(
            name_or_path + os.sep + "adapter_model.bin",
            map_location=self.device_,
            weights_only=False,
        )

        self.init_adapter(lora_config, lora_weight)
        return adapter_name

    def compute_route_loss(self, logits, cites):
        """Cross-entropy for the 2-way router.

        Labels: 1 at citation positions, 0 elsewhere; logits are L2-normalized
        and shifted by one (predict the NEXT position's label).
        """
        nrom_logits = logits / torch.norm(logits, dim = -1, keepdim=True)
        b, l, v = logits.shape
        """for c in cites:
            if c[-1] == l:
                del c[-1]"""
        label = []
        for k in range(b):
            label.append([1 if i in cites[k] else 0 for i in range(l)])

        # NOTE(review): `label` is always a list here (built just above), so
        # the Tensor branch is dead code.
        if isinstance(label, torch.Tensor):
            label = (
                label.clone()
                .detach()
                .to(dtype=torch.long, device=logits.device)
            )
        else:
            label = torch.tensor(label, dtype=torch.long, device=logits.device)

        loss_fn = torch.nn.CrossEntropyLoss()
        return loss_fn(
            nrom_logits[..., :-1, :].contiguous().view(-1, v),
            label[..., 1:].contiguous().view(-1),
        )

    def compute_cite_loss2(self, logits, cites, cites_v, docs_pos):
        """Alternative citation loss scoring hidden states against document
        positions taken from the sequence itself (currently unused in forward;
        see the commented-out call there).
        """
        b = len(logits)
        docs_pos = [torch.tensor(i) for i in docs_pos]
        doc_embeds = []
        norm_logits = [F.normalize(logits[batch], p = 2, dim = 1) for batch in range(logits.shape[0])]
        for i in range(b):
            doc_embeds.append(norm_logits[i][docs_pos[i]].transpose(0,1))
        b_logits = []

        for i in range(len(cites)):
            b_logits.append(norm_logits[i] @ doc_embeds[i])
        for k in range(len(cites_v)):
            cites_v[k] = [self.token2id(i) for i in cites_v[k]]

        # Labels: document index at citation positions, -100 (ignored) elsewhere.
        labels = []
        for k in range(b):
            labels.append([-100 for _ in range(logits[k].shape[0])])
            for i, v in zip(cites[k], cites_v[k]):
                labels[k][i] = v - 1

        if isinstance(labels[0], torch.Tensor):
            for k in range(b):
                labels[k] = (
                    labels[k].clone()
                    .detach()
                    .to(dtype=torch.long, device=logits[0].device)
                )
        else:
            for k in range(b):
                labels[k] = torch.tensor(labels[k], dtype=torch.long, device=logits[0].device)

        loss_fn = torch.nn.CrossEntropyLoss(ignore_index = -100)

        loss = 0
        for k in range(b):
            if len(cites[k]) != 0:
                loss += loss_fn(
                    b_logits[k][..., :-1, :].contiguous().view(-1, b_logits[k].shape[-1]),
                    labels[k][..., 1:].contiguous().view(-1),
                )
        return loss / b

    def compute_cite_loss(self, logits, cites, cites_v):
        """Cross-entropy over citation-similarity logits.

        Labels are the (0-based) document index at each citation position and
        -100 elsewhere; shifted by one position like the LM loss. Batch
        elements with no citations contribute zero.
        """
        b = len(logits)

        for k in range(len(cites_v)):
            """if len(cites[k]) > len(cites_v[k]):
                del cites[k][-1]"""
            cites_v[k] = [self.token2id(i) for i in cites_v[k]]

        labels = []
        for k in range(b):
            labels.append([-100 for _ in range(logits[k].shape[0])])
            for i, v in zip(cites[k], cites_v[k]):
                labels[k][i] = v - 1

        if isinstance(labels[0], torch.Tensor):
            for k in range(b):
                labels[k] = (
                    labels[k].clone()
                    .detach()
                    .to(dtype=torch.long, device=logits[0].device)
                )
        else:
            for k in range(b):
                labels[k] = torch.tensor(labels[k], dtype=torch.long, device=logits[0].device)

        loss_fn = torch.nn.CrossEntropyLoss(ignore_index = -100)

        loss = 0
        for k in range(b):
            if len(cites[k]) != 0:
                loss += loss_fn(
                    logits[k][..., :-1, :].contiguous().view(-1, logits[k].shape[-1]),
                    labels[k][..., 1:].contiguous().view(-1),
                )
        return loss / b


    def attention_loss_fn(self, mat, mask, cites, prompt_len):
        """Hinge penalty pushing attention toward earlier citation anchors.

        Args:
            mat: list of per-layer attention matrices (last three layers).
            mask: additive causal mask applied before the softmax.
            cites: per-batch lists of citation column positions (T entries,
                each the column index of c_i — original note, translated).
            prompt_len: per-batch prompt lengths (unused here; filtering is
                done by the caller).
        """
        mat = torch.stack(mat, dim = 0)
        mat = mat.permute(1,0,3,4,2)
        #final_mat = torch.matmul(mat, self.attention_weight).squeeze(-1)
        final_mat = torch.mean(mat, dim = -1)  # average over heads
        final_mat += mask
        final_mat = F.softmax(final_mat, dim=-1)
        loss = torch.tensor(0.0, dtype = final_mat.dtype, device = final_mat.device)
        num_layer = final_mat.shape[1]
        for batch in range(final_mat.shape[0]):
            if len(cites[batch]) == 0:
                continue
            # For each citation span (k, k+1), penalize attention to every
            # earlier citation i <= k that falls below its alpha/beta target.
            for k in range(len(cites[batch]) - 1):
                for i in range(k + 1):
                    if cites[batch][k] == cites[batch][k + 1] - 1:
                        continue  # empty span between consecutive citations
                    loss_now = (self.alpha[cites[batch][k]:cites[batch][k + 1] - 1] * self.beta[k - i, k]).expand(1, num_layer,-1) - final_mat[batch,:,cites[batch][k]:cites[batch][k + 1] - 1,cites[batch][i]]
                    loss += F.relu(loss_now).sum() / (cites[batch][k + 1] - cites[batch][k])

        return loss
c2cite/models/__init__.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .modeling_chatglm import GLMForCausalLM
2
+ from .modeling_gemma import GemmaForCausalLM
3
+ from .modeling_gemma2 import Gemma2ForCausalLM
4
+ from .modeling_llama import LlamaForCausalLM
5
+ from .modeling_mistral import MistralForCausalLM
6
+ from .modeling_mistral import MistralForCausalLM as Qwen2ForCausalLM
7
+ from .modeling_phi import PhiForCausalLM
8
+ from .modeling_phi3 import Phi3ForCausalLM
9
+
10
# Maps a Hugging Face `config.model_type` string to the wrapper class that
# implements it. NOTE: "qwen2" reuses the Mistral implementation (aliased at
# import time above).
model_dict = {
    "llama": LlamaForCausalLM,
    "gemma": GemmaForCausalLM,
    "gemma2": Gemma2ForCausalLM,
    "mistral": MistralForCausalLM,
    "qwen2": Qwen2ForCausalLM,
    "phi": PhiForCausalLM,
    "phi3": Phi3ForCausalLM,
    "chatglm": GLMForCausalLM,
}
20
+
21
+
22
def from_pretrained(llm_model, **kwargs):
    """Instantiate the wrapper matching *llm_model*'s architecture.

    Looks up ``llm_model.config.model_type`` in ``model_dict`` and delegates
    to that class's ``from_pretrained``; raises RuntimeError for unsupported
    architectures.
    """
    model_type = llm_model.config.model_type
    if model_type not in model_dict:
        raise RuntimeError(f"Model {model_type} not supported.")
    return model_dict[model_type].from_pretrained(llm_model, **kwargs)
29
+
30
+
31
# Public API of the models package: the per-architecture wrappers plus the
# dispatching `from_pretrained` factory.
__all__ = [
    "LlamaForCausalLM",
    "GemmaForCausalLM",
    "MistralForCausalLM",
    "Qwen2ForCausalLM",
    "PhiForCausalLM",
    "Phi3ForCausalLM",
    "from_pretrained",
    "GLMForCausalLM",
]
c2cite/models/modeling_chatglm.py ADDED
@@ -0,0 +1,855 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch.nn import LayerNorm
9
+ from transformers.utils import is_flash_attn_2_available
10
+
11
+ from moe_peft.common import (
12
+ FeedForward,
13
+ Linear,
14
+ LLMAttention,
15
+ LLMCache,
16
+ LLMDecoder,
17
+ LLMFeedForward,
18
+ LLMForCausalLM,
19
+ LLMModelConfig,
20
+ LLMModelInput,
21
+ collect_plugin_router_logtis,
22
+ flash_attention_forward,
23
+ slice_tensor,
24
+ )
25
+ from moe_peft.executors import executor
26
+ from moe_peft.utils import copy_parameters
27
+
28
+
29
@dataclass
class GLMConfig(LLMModelConfig):
    """ChatGLM-specific model configuration, extending the shared config.

    Field meanings mirror the upstream ChatGLM `config.json` keys.
    """

    # Normalization / residual layout.
    post_layer_norm: bool = True
    rmsnorm: bool = True
    layernorm_epsilon: float = 1e-5
    apply_residual_connection_post_layernorm: bool = False
    fp32_residual_connection: bool = False
    # Attention geometry (kv_channels = per-head key/value width).
    kv_channels: int = 128
    multi_query_attention: bool = False
    multi_query_group_num: int = 2
    # Attention numerics.
    apply_query_key_layer_scaling: bool = True
    attention_softmax_in_fp32: bool = True
    # Rotary embedding variant and scaling ratio.
    original_rope: bool = True
    add_bias_linear: bool = False
    # -1 means "not set"; upstream pads the vocab for tensor-parallel shards.
    padded_vocab_size: int = -1
    rope_ratio: float = 1
46
+
47
def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]:
    """Split *tensor* into *num_partitions* equal chunks along its last dim.

    ``torch.split`` returns views into the original storage; pass
    ``contiguous_split_chunks=True`` to force each chunk contiguous.
    """
    split_size = tensor.size(-1) // num_partitions
    chunks = torch.split(tensor, split_size, dim=-1)
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)
    return chunks
62
+
63
+
64
class RotaryEmbedding(nn.Module):
    """Precomputes the cos/sin rotary-position cache used by GLM attention."""

    def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None):
        super().__init__()
        exponent = torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponent))
        self.dim = dim
        self.original_impl = original_impl
        self.rope_ratio = rope_ratio

    def forward_impl(
        self,
        seq_len: int,
        n_elem: int,
        dtype: torch.dtype,
        device: torch.device,
        base: int = 10000,
    ):
        """Build the (seq_len, n_elem // 2, 2) cache of [cos, sin] pairs.

        Frequencies follow the RoPE recipe: theta_i = base^(-2(i-1)/d),
        with *base* scaled by ``rope_ratio``.
        """
        effective_base = base * self.rope_ratio
        exponents = torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem
        theta = 1.0 / (effective_base ** exponents)

        # Angle for every (position, frequency) pair.
        positions = torch.arange(seq_len, dtype=torch.float, device=device)
        angles = torch.outer(positions, theta).float()

        cache = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)

        # Mimic complex32 behaviour for reduced-precision dtypes so results
        # match the reference implementation.
        if dtype in (torch.float16, torch.bfloat16, torch.int8):
            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        # NOTE: `offset` is accepted for interface compatibility but unused,
        # matching the original implementation.
        return self.forward_impl(
            max_seq_len,
            self.dim,
            dtype=self.inv_freq.dtype,
            device=self.inv_freq.device,
        )
110
+
111
+
112
@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate the first `rot_dim` channels of `x` by the (cos, sin) pairs in `rope_cache`.

    Kept TorchScript-compatible (decorated with @torch.jit.script); the tail
    channels beyond `rot_dim` pass through unchanged.
    """
    # x: [b, np, sq, hn]
    b, np, sq, _ = x.shape
    # Each cache entry covers two interleaved channels.
    rot_dim = rope_cache.shape[-2] * 2
    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    # truncate to support variable sizes
    rope_cache = rope_cache[:, :sq]
    # View channels as (pairs, 2) so each pair can be rotated as a 2-D vector.
    xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
    rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
    # Standard 2-D rotation: (x0*cos - x1*sin, x1*cos + x0*sin).
    x_out2 = torch.stack(
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
        ],
        -1,
    )
    # Re-interleave the rotated pairs and append the untouched tail channels.
    x_out2 = x_out2.flatten(3)
    return torch.cat((x_out2, x_pass), dim=-1)
131
+
132
+
133
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization with a learned scale.

    The mean-square statistic is computed in float32 for stability, then the
    result is cast back to the input dtype. Extra keyword arguments are
    accepted (and ignored) for interface compatibility with LayerNorm.
    """

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        # Note: weight starts uninitialized; callers load real values into it.
        self.weight = torch.nn.Parameter(
            torch.empty(normalized_shape, device=device, dtype=dtype)
        )
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        orig_dtype = hidden_states.dtype
        # Mean of squares over the last dimension, accumulated in fp32.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_square + self.eps)
        return (self.weight * normed).to(orig_dtype)
147
+
148
+
149
class CoreAttention(torch.nn.Module):
    """Eager (matmul + softmax) scaled-dot-product attention for GLM.

    Expects query/key/value in ``[b, np, s, hn]`` layout and returns the
    context in ``[b, sq, np * hn]``. When `apply_query_key_layer_scaling` is
    on, scores are additionally divided by the layer number before softmax
    and multiplied back afterwards (the Megatron numerical-stability trick).
    """

    def __init__(self, config: GLMConfig, layer_number):
        super(CoreAttention, self).__init__()
        self.config = config
        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Layer scaling only makes sense with an fp32 softmax; force it on.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.is_causal = True

        projection_size = config.kv_channels * config.n_heads_

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.n_heads_
        self.num_attention_heads_per_partition = config.n_heads_

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            # Fold the layer number into the score divisor; undone after the
            # fp32 cast below via `self.coeff`.
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Compute attention; `attention_mask` is boolean with True = masked.

        If no mask is given and the score matrix is square, a causal
        (lower-triangular) mask is synthesized on the fly.
        """
        # [b, np, sq, sk]
        output_size = (
            query_layer.size(0),
            query_layer.size(1),
            query_layer.size(2),
            key_layer.size(2),
        )

        # [b, np, sq, hn] -> [b * np, sq, hn]
        query_layer = query_layer.view(
            output_size[0] * output_size[1], output_size[2], -1
        )
        # [b, np, sk, hn] -> [b * np, sk, hn]
        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)

        # preallocting input tensor: [b * np, sq, sk]
        # (beta=0 below means its contents are never read, only its storage.)
        matmul_input_buffer = torch.empty(
            output_size[0] * output_size[1],
            output_size[2],
            output_size[3],
            dtype=query_layer.dtype,
            device=query_layer.device,
        )

        # Raw attention scores. [b * np, sq, sk]
        matmul_result = torch.baddbmm(
            matmul_input_buffer,
            query_layer,  # [b * np, sq, hn]
            key_layer.transpose(1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # attention scores and attention mask [b, np, sq, sk]
        if self.attention_softmax_in_fp32:
            attention_scores = attention_scores.float()
        if self.coeff is not None:
            # Undo the layer-number division now that we are in fp32.
            attention_scores = attention_scores * self.coeff
        if (
            attention_mask is None
            and attention_scores.shape[2] == attention_scores.shape[3]
        ):
            # Square scores with no mask => prefill; build a causal mask
            # (True marks positions to be masked out).
            attention_mask = torch.ones(
                output_size[0],
                1,
                output_size[2],
                output_size[3],
                device=attention_scores.device,
                dtype=torch.bool,
            )
            attention_mask.tril_()
            attention_mask = ~attention_mask
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(
                attention_mask, float("-inf")
            )
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = attention_probs.type_as(value_layer)

        # query layer shape: [b * np, sq, hn]
        # value layer shape: [b, np, sk, hn]
        # attention shape: [b, np, sq, sk]
        # context layer shape: [b, np, sq, hn]
        output_size = (
            value_layer.size(0),
            value_layer.size(1),
            query_layer.size(1),
            value_layer.size(3),
        )
        # change view [b * np, sk, hn]
        value_layer = value_layer.view(
            output_size[0] * output_size[1], value_layer.size(2), -1
        )
        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(
            output_size[0] * output_size[1], output_size[2], -1
        )
        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer)
        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)
        # [b, np, sq, hn] --> [b, sq, np, hn]
        context_layer = context_layer.transpose(1, 2).contiguous()
        # [b, sq, np, hn] --> [b, sq, hp]
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size_per_partition,
        )
        context_layer = context_layer.reshape(*new_context_layer_shape)

        return context_layer
268
+
269
+
270
class FlashAttention2(CoreAttention):
    """CoreAttention variant that dispatches to FlashAttention-2 kernels.

    Accepts the same [b, np, s, hn] inputs as CoreAttention and returns the
    context in [b, sq, np * hn]; masking is delegated to
    `flash_attention_forward`.
    """

    def __init__(self, *args, **kwargs):
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(*args, **kwargs)

    def forward(self, query_states, key_states, value_states, attention_mask):
        # Flash kernels expect [b, s, np, hn]; swap the head and seq axes.
        query_states, key_states, value_states = (
            t.transpose(1, 2) for t in (query_states, key_states, value_states)
        )

        batch_size = query_states.shape[0]
        query_length = query_states.shape[1]

        attn_output = flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            query_length,
            is_causal=self.is_causal,
        )

        # Merge heads back into a single hidden dimension.
        return attn_output.reshape(
            batch_size, query_length, self.hidden_size_per_partition
        ).contiguous()
296
+
297
+
298
# Maps the configured `attn_implementation_` string to the attention backend
# class used by GLMSelfAttention.
CORE_ATTENTION_CLASSES = {
    "eager": CoreAttention,
    "flash_attn": FlashAttention2,
}
302
+
303
+
304
class GLMSelfAttention(LLMAttention):
    """GLM self-attention wrapping a fused QKV projection and an output dense.

    Both projections are wrapped in the project's LoRA-aware `Linear`; the
    score computation itself is delegated to a CORE_ATTENTION_CLASSES backend.
    Supports multi-query attention (shared K/V groups) when configured.
    """

    def __init__(
        self,
        qkv_layer: torch.nn.Module,
        dense_layer: torch.nn.Module,
        config: GLMConfig,
        layer_idx,
    ):
        super(GLMSelfAttention, self).__init__()
        self.layer_idx = layer_idx

        self.projection_size = config.kv_channels * config.n_heads_

        # Per attention head and per-partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.n_heads_
        self.num_attention_heads_per_partition = config.n_heads_
        self.multi_query_attention = config.multi_query_attention
        self.qkv_hidden_size = 3 * self.projection_size

        if self.multi_query_attention:
            # With MQA the fused projection holds full Q plus one K and one V
            # slice per group, hence the smaller hidden size.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                self.projection_size
                + 2
                * self.hidden_size_per_attention_head
                * self.num_multi_query_groups_per_partition
            )

        # QKV layer.
        self.query_key_value = Linear(base_layer=qkv_layer, device=config.device_)
        # Core attention layer.
        self.core_attention = CORE_ATTENTION_CLASSES[config.attn_implementation_](
            config, self.layer_idx
        )

        # Dense layer.
        self.dense = Linear(base_layer=dense_layer, device=config.device_)

    def state_dict(self) -> Dict[str, Linear]:
        # Exposes the LoRA-wrapped projections under their adapter names.
        return {"qkv_proj": self.query_key_value, "dense": self.dense}

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_pos_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run attention over `hidden_states`, updating the KV cache if given."""
        mixed_x_layer = self.query_key_value(hidden_states, input_args)

        if self.multi_query_attention:
            # Carve the fused projection into full-Q and grouped-K/V slices.
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition
                    * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition
                    * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition
                    * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1]
                + (
                    self.num_attention_heads_per_partition,
                    self.hidden_size_per_attention_head,
                )
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1]
                + (
                    self.num_multi_query_groups_per_partition,
                    self.hidden_size_per_attention_head,
                )
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1]
                + (
                    self.num_multi_query_groups_per_partition,
                    self.hidden_size_per_attention_head,
                )
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + (
                self.num_attention_heads_per_partition,
                3 * self.hidden_size_per_attention_head,
            )
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(
                mixed_x_layer, 3
            )

        # [b, sq, np, hn] -> [b, np, sq, hn]
        query_layer, key_layer, value_layer = [
            k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]
        ]

        # apply relative positional encoding (rotary embedding)
        query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
        key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        if past_key_value is not None:
            # Append this step's K/V to the cache and read back the full run.
            key_layer, value_layer = past_key_value.update(
                key_layer,
                value_layer,
                self.layer_idx,
                {"cache_position": cache_position},
            )

        if self.multi_query_attention:
            # Broadcast each K/V group across its query heads:
            # [b, ng, s, hn] -> [b, ng, np/ng, s, hn] -> [b, np, s, hn].
            key_layer = key_layer.unsqueeze(2)
            key_layer = key_layer.expand(
                -1,
                -1,
                self.num_attention_heads_per_partition
                // self.num_multi_query_groups_per_partition,
                -1,
                -1,
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:1]
                + (self.num_attention_heads_per_partition,)
                + key_layer.size()[3:]
            )
            value_layer = value_layer.unsqueeze(2)
            value_layer = value_layer.expand(
                -1,
                -1,
                self.num_attention_heads_per_partition
                // self.num_multi_query_groups_per_partition,
                -1,
                -1,
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:1]
                + (self.num_attention_heads_per_partition,)
                + value_layer.size()[3:]
            )

        context_layer = self.core_attention(
            query_layer,
            key_layer,
            value_layer,
            attention_mask,
        )

        output = self.dense(context_layer, input_args)

        return output
458
+
459
+
460
def swiglu(x):
    """SwiGLU activation: split the last dim in half, gate one half with SiLU.

    Returns a tensor whose last dimension is half of the input's.
    """
    gate, value = torch.chunk(x, 2, dim=-1)
    return F.silu(gate) * value
463
+
464
+
465
class GLMMLP(LLMFeedForward):
    """GLM feed-forward block: up-projection, SwiGLU, down-projection.

    Provides three entry points used by the framework: the plain batch pass,
    a single-adapter LoRA pass, and a MixLoRA pass that routes token subsets
    to per-expert LoRA adapters.
    """

    def __init__(
        self,
        dense_h_to_4h: torch.nn.Module,
        dense_4h_to_h: torch.nn.Module,
        config: GLMConfig,
    ) -> None:
        super().__init__()
        # Both projections are wrapped in the LoRA-aware Linear.
        self.dense_h_to_4h: Linear = Linear(dense_h_to_4h, config.device_)
        self.dense_4h_to_h: Linear = Linear(dense_4h_to_h, config.device_)

        # SwiGLU halves the last dimension, which the fused 4h projection
        # accounts for by producing gate and value halves together.
        self.activation_func = swiglu

    def state_dict(self) -> Dict[str, torch.nn.Module]:
        return {
            "dense_h_to_4h": self.dense_h_to_4h,
            "dense_4h_to_h": self.dense_4h_to_h,
        }

    def _batch_forward(
        self, data: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Standard forward pass through the (possibly LoRA-wrapped) projections."""
        # [b, sq, h] -> [b, sq, 4hp]
        intermediate_parallel = self.dense_h_to_4h(data, input_args)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [b, sq, 4hp] -> [b, sq, h]
        output = self.dense_4h_to_h(intermediate_parallel, input_args)
        return output

    def _lora_forward(
        self, lora_name: str, act_fn: torch.nn.Module, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        """Forward using a single named LoRA adapter, falling back to the base
        layer when that adapter is absent on a projection.

        Note: `act_fn` is accepted for interface compatibility but this block
        always applies `self.activation_func` (SwiGLU).
        """
        if lora_name in self.dense_h_to_4h.loras_:
            hidden_states = self.dense_h_to_4h.loras_[lora_name].forward(
                self.dense_h_to_4h.base_layer_.forward(hidden_states), hidden_states
            )
        else:
            hidden_states = self.dense_h_to_4h.base_layer_.forward(hidden_states)

        hidden_states = self.activation_func(hidden_states)

        if lora_name in self.dense_4h_to_h.loras_:
            hidden_states = self.dense_4h_to_h.loras_[lora_name].forward(
                self.dense_4h_to_h.base_layer_.forward(hidden_states), hidden_states
            )
        else:
            hidden_states = self.dense_4h_to_h.base_layer_.forward(hidden_states)

        return hidden_states

    def _mixlora_forward(
        self, moe_name, act_fn, expert_mask, hidden_states, input_dtype
    ):
        """MixLoRA forward: run each expert's LoRA on the token rows routed to
        it (per `expert_mask`) and return the per-expert output list.

        The base up-projection is computed once and shared across experts.
        """
        common_dense_h_to_4h = self.dense_h_to_4h.base_layer_.forward(
            hidden_states.to(input_dtype)
        ).to(hidden_states.dtype)
        final_expert_states = []
        for expert_idx in range(expert_mask.shape[0]):
            # Row indices of tokens assigned to this expert.
            _, top_x = torch.where(expert_mask[expert_idx])

            lora_name = f"moe.{moe_name}.experts.{expert_idx}"
            if lora_name in self.dense_h_to_4h.loras_:
                lora_data = slice_tensor(hidden_states, top_x, input_dtype)
                act_result = self.activation_func(
                    self.dense_h_to_4h.loras_[lora_name].forward(
                        slice_tensor(common_dense_h_to_4h, top_x, input_dtype),
                        lora_data,
                    )
                )
            else:
                act_result = self.activation_func(
                    slice_tensor(common_dense_h_to_4h, top_x, input_dtype)
                )

            if lora_name in self.dense_4h_to_h.loras_:
                final_expert_states.append(
                    self.dense_4h_to_h.loras_[lora_name].forward(
                        self.dense_4h_to_h.base_layer_.forward(act_result), act_result
                    )
                )
            else:
                final_expert_states.append(
                    self.dense_4h_to_h.base_layer_.forward(act_result)
                )

        return final_expert_states
551
+
552
+
553
class GLMDecoderLayer(LLMDecoder):
    """One GLM transformer block: pre-norm attention and MLP with dropout on
    each sub-layer output before the residual add.

    When `apply_residual_connection_post_layernorm` is set, the residual is
    taken from the layernorm output instead of the sub-layer input (matching
    the upstream ChatGLM option).
    """

    def __init__(
        self, self_attn: GLMSelfAttention, mlp: FeedForward, config: GLMConfig
    ) -> None:
        super().__init__()
        self.layer_id_ = self_attn.layer_idx
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        self.fp32_residual_connection = config.fp32_residual_connection

        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        # Input layer norm.
        self.input_layernorm = LayerNormFunc(
            config.dim_,
            eps=config.layernorm_epsilon,
            device=config.device_,
            dtype=config.dtype_,
        )
        # Self-attention layer.
        self.self_attn_: GLMSelfAttention = self_attn
        self.hidden_dropout = config.hidden_dropout_

        # Post attention layer norm.
        self.post_layernorm = LayerNormFunc(
            config.dim_,
            eps=config.layernorm_epsilon,
            device=config.device_,
            dtype=config.dtype_,
        )
        # mlp
        self.mlp_: FeedForward = mlp

    def state_dict(self) -> Tuple[Dict[str, nn.Module], Dict[str, nn.Module]]:
        # (attention modules, feed-forward modules) for adapter bookkeeping.
        return self.self_attn_.state_dict(), self.mlp_.state_dict()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_pos_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        layernorm_output = self.input_layernorm(hidden_states)

        attention_output = self.self_attn_.forward(
            layernorm_output,
            input_args,
            rotary_pos_emb,
            attention_mask,
            cache_position,
            past_key_value,
        )

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Dropout is active only when training (inference_mode_ False).
        layernorm_input = F.dropout(
            attention_output,
            p=self.hidden_dropout,
            training=not input_args.inference_mode_,
        )
        layernorm_input = residual + layernorm_input

        # Layer norm post the self attention.
        layernorm_output = self.post_layernorm(layernorm_input)

        # MLP.
        mlp_output, router_logits = self.mlp_(layernorm_output, input_args)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = F.dropout(
            mlp_output, p=self.hidden_dropout, training=not input_args.inference_mode_
        )
        output = residual + output

        if input_args.output_router_logits_:
            router_logits = collect_plugin_router_logtis(
                router_logits, input_args, self
            )

        # NOTE(review): `*router_logits` assumes the MLP / collector returns an
        # iterable of per-adapter logits — confirm against FeedForward's contract.
        return output, *router_logits
645
+
646
+
647
class GLMEmbedding(torch.nn.Module):
    """Token-embedding front end for the GLM transformer.

    Looks tokens up in a word-embedding table sized by the padded vocabulary
    and optionally promotes the result to float32 for fp32 residual paths.
    """

    def __init__(self, config: "GLMConfig"):
        super(GLMEmbedding, self).__init__()

        self.hidden_size = config.dim_
        # Word embeddings (parallel).
        self.word_embeddings = nn.Embedding(
            config.padded_vocab_size,
            self.hidden_size,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        embeddings = self.word_embeddings(input_ids)
        # When fp32 residuals are requested, cast up-front so every residual
        # addition downstream runs in full precision.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        return embeddings
669
+
670
+
671
class GLMForCausalLM(LLMForCausalLM):
    """ChatGLM causal-LM wrapper adapting a HuggingFace ChatGLM checkpoint to
    this project's decoder-stack interface (embedding, RoPE cache, layers,
    final norm, LM head)."""

    def __init__(self, config: GLMConfig) -> None:
        self.config_ = config
        self.padding_idx_ = config.pad_token_id_
        self.vocab_size_ = config.vocab_size_

        # Embedding layer.
        self.embed_tokens_ = GLMEmbedding(config)
        # Rotary Position Embedding.
        # Populated later by `from_pretrained`; None until then.
        self.rotary_emb_layer: RotaryEmbedding = None
        # Encoder(Decoder) layers.
        self.layers_: List[GLMDecoderLayer] = []
        # Final layer norm.
        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        if self.config_.post_layer_norm:
            self.final_layernorm_ = LayerNormFunc(
                config.dim_,
                eps=config.layernorm_epsilon,
                device=config.device_,
                dtype=config.dtype_,
            )
        else:
            self.final_layernorm_ = nn.Identity()
        # Output layer.
        self.lm_head_ = torch.nn.Linear(
            config.dim_,
            config.vocab_size_,
            bias=config.add_bias_linear,
            dtype=config.dtype_,
            device=config.device_,
        )

    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens_(input_ids)

    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Builds the full-length RoPE cache, then gathers the rows for the
        # positions of the last sequence in the batch.
        # NOTE(review): `position_ids[-1]` indexes only the final batch row —
        # this presumes identical positions across the batch; confirm callers.
        return self.rotary_emb_layer(max_seq_len=self.config_.max_seq_len_)[
            None, position_ids[-1]
        ]

    def decoder_stack(self) -> List[LLMDecoder]:
        return self.layers_

    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.final_layernorm_(hidden_states)

    def get_masks(
        self,
        input_ids: torch.Tensor,
        past_key_values: LLMCache,
        padding_mask: torch.Tensor,
    ):
        """Build ChatGLM's boolean attention mask (True = masked position),
        combining the causal triangle, the cached prefix, and padding."""
        batch_size, seq_length, _ = input_ids.shape
        full_attention_mask = torch.ones(
            batch_size, seq_length, seq_length, device=input_ids.device
        )
        full_attention_mask.tril_()
        past_length = 0
        if past_key_values:
            past_length = past_key_values.get_seq_length()
        if past_length:
            # Cached positions are fully visible to every query.
            full_attention_mask = torch.cat(
                (
                    torch.ones(
                        batch_size, seq_length, past_length, device=input_ids.device
                    ),
                    full_attention_mask,
                ),
                dim=-1,
            )
        if padding_mask is not None:
            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
        if not past_length and padding_mask is not None:
            # Upstream ChatGLM trick: re-open rows whose query token is padding
            # so softmax never sees an all-masked row.
            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
        # Invert: downstream `masked_fill` expects True where attention is blocked.
        full_attention_mask = (full_attention_mask < 0.5).bool()
        full_attention_mask.unsqueeze_(1)
        return full_attention_mask

    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:
        # `cache_position` is unused; ChatGLM derives everything from the
        # cache length inside `get_masks`.
        return self.get_masks(input_tensor, past_key_values, attention_mask)

    def model_config(self) -> GLMConfig:
        return self.config_

    @staticmethod
    def from_pretrained(
        llm_model,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Convert a loaded HuggingFace ChatGLM model into a GLMForCausalLM,
        copying (frozen) weights layer by layer."""
        assert not use_sliding_window, "ChatGLM model does not support SWA."
        # Get the config from LLM model and input args.
        llm_config = llm_model.config
        config = GLMConfig(
            # LLM model args.
            name_or_path_=llm_config._name_or_path,
            device_=device,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.hidden_size // llm_config.num_attention_heads,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.multi_query_group_num,
            n_layers_=llm_config.num_layers,
            hidden_act_=swiglu,
            hidden_dropout_=llm_config.hidden_dropout,
            vocab_size_=llm_config.vocab_size,
            pad_token_id_=llm_config.pad_token_id,
            max_seq_len_=llm_config.seq_length,
            attn_implementation_=attn_impl,
            dtype_=llm_model.dtype,
            # ChatGLM args.
            post_layer_norm=llm_config.post_layer_norm,
            rmsnorm=llm_config.rmsnorm,
            layernorm_epsilon=llm_config.layernorm_epsilon,
            apply_residual_connection_post_layernorm=llm_config.apply_residual_connection_post_layernorm,
            fp32_residual_connection=llm_config.fp32_residual_connection,
            apply_query_key_layer_scaling=llm_config.apply_query_key_layer_scaling,
            kv_channels=llm_config.kv_channels,
            multi_query_attention=llm_config.multi_query_attention,
            multi_query_group_num=llm_config.multi_query_group_num,
            attention_softmax_in_fp32=llm_config.attention_softmax_in_fp32,
            original_rope=llm_config.original_rope,
            add_bias_linear=llm_config.add_bias_linear,
            padded_vocab_size=llm_config.padded_vocab_size,
            # Older ChatGLM configs lack rope_ratio; default to 1.
            rope_ratio=(
                llm_config.rope_ratio if hasattr(llm_config, "rope_ratio") else 1
            ),
        )

        model = GLMForCausalLM(config)
        llm_model.requires_grad_(False)

        copy_parameters(
            llm_model.transformer.embedding,
            model.embed_tokens_,
        )

        rotary_dim = (
            config.dim_ // config.n_heads_
            if config.kv_channels is None
            else config.kv_channels
        )
        # ChatGLM rotates only half of the rotary channels, hence dim // 2.
        model.rotary_emb_layer = RotaryEmbedding(
            dim=rotary_dim // 2,
            rope_ratio=config.rope_ratio,
            original_impl=config.original_rope,
            device=device,
            dtype=config.dtype_,
        )

        for idx, layer in enumerate(llm_model.transformer.encoder.layers):
            # Get self-attention layer.
            self_attention = GLMSelfAttention(
                qkv_layer=layer.self_attention.query_key_value,
                dense_layer=layer.self_attention.dense,
                config=config,
                layer_idx=idx,
            )
            # Get MLP layer.
            mlp = FeedForward(
                GLMMLP(layer.mlp.dense_h_to_4h, layer.mlp.dense_4h_to_h, config=config)
            )
            # Create a transformer block.
            encoder = GLMDecoderLayer(self_attention, mlp, config)
            copy_parameters(layer.input_layernorm, encoder.input_layernorm)
            copy_parameters(layer.post_attention_layernorm, encoder.post_layernorm)
            model.layers_.append(encoder)

        if config.post_layer_norm:
            copy_parameters(
                llm_model.transformer.encoder.final_layernorm,
                model.final_layernorm_,
            )

        copy_parameters(llm_model.transformer.output_layer, model.lm_head_)

        return model
c2cite/models/modeling_gemma.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers.models.gemma import modeling_gemma
5
+
6
+ from moe_peft.common import FeedForward
7
+ from moe_peft.executors import executor
8
+ from moe_peft.models.modeling_llama import (
9
+ LLAMA_ATTENTION_CLASSES as GEMMA_ATTENTION_CLASSES,
10
+ )
11
+ from moe_peft.models.modeling_llama import (
12
+ LlamaConfig,
13
+ LlamaDecoderLayer,
14
+ LlamaForCausalLM,
15
+ LlamaMLP,
16
+ )
17
+ from moe_peft.utils import copy_parameters
18
+
19
+
20
class GemmaRMSNorm(nn.Module):
    """RMS normalization with Gemma's (1 + weight) scaling convention."""

    def __init__(self, weight: torch.Tensor, eps: float = 1e-6):
        super().__init__()
        self.norm_eps_ = eps
        self.weight_ = weight

    def _norm(self, x):
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.norm_eps_)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in float32, scale by (1 + w) *before* casting back down.
        # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16);
        # see https://github.com/huggingface/transformers/pull/29402
        normed = self._norm(x.to(torch.float32))
        scaled = normed * (1.0 + self.weight_.to(torch.float32))
        return scaled.to(x.dtype)
35
+
36
+
37
class GemmaEmbedding(nn.Module):
    """Embedding lookup that rescales outputs by Gemma's sqrt(hidden_dim) factor."""

    def __init__(self, embedding: torch.Tensor, pad_token: int, normalizer: float):
        super().__init__()
        self.token_embedding_: torch.Tensor = embedding
        self.padding_idx_: int = pad_token
        self.normalizer_: float = normalizer

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        embedded = F.embedding(tokens, self.token_embedding_, padding_idx=self.padding_idx_)
        # Cast the scale to the embedding dtype before multiplying: Gemma
        # downcasts sqrt(3072)=55.4256 to 55.5 in float16.
        # See https://github.com/huggingface/transformers/pull/29402
        scale = torch.tensor(self.normalizer_, dtype=embedded.dtype)
        return embedded * scale
52
+
53
+ def _patch_hidden_act(config: modeling_gemma.GemmaConfig) -> str:
54
+ if hasattr(config, "hidden_activation") and config.hidden_activation is not None:
55
+ return config.hidden_activation
56
+ else:
57
+ return config.hidden_act
58
+
59
+
60
class GemmaForCausalLM(LlamaForCausalLM):
    """Gemma causal LM built on the Llama implementation, swapping in Gemma's
    RMSNorm variant and the sqrt(dim)-scaled embedding."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__(config)

    @staticmethod
    def from_pretrained(
        llm_model: modeling_gemma.GemmaForCausalLM,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Convert a HuggingFace Gemma model into this project's wrapper,
        reusing the Llama decoder/MLP/attention machinery."""
        assert not use_sliding_window, "Gemma model does not support SWA."
        llm_config: modeling_gemma.GemmaConfig = llm_model.config
        llm_args = LlamaConfig(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.head_dim,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            # Handles the hidden_act -> hidden_activation config rename.
            hidden_act_=_patch_hidden_act(llm_config),
            rms_norm_eps_=llm_config.rms_norm_eps,
            max_seq_len_=llm_config.max_position_embeddings,
            rope_theta_=llm_config.rope_theta,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # -1 sentinel keeps F.embedding's padding_idx argument valid.
        if llm_args.pad_token_id_ is None:
            llm_args.pad_token_id_ = -1

        model = GemmaForCausalLM(llm_args)
        llm_model.requires_grad_(False)
        # Gemma scales embeddings by sqrt(hidden_size).
        model.embed_tokens_ = GemmaEmbedding(
            llm_model.model.embed_tokens.weight,
            llm_args.pad_token_id_,
            llm_args.dim_**0.5,
        )
        model.norm_ = GemmaRMSNorm(llm_model.model.norm.weight, llm_args.rms_norm_eps_)
        copy_parameters(llm_model.lm_head, model.lm_head_)

        for idx, layer in enumerate(llm_model.model.layers):
            decoder = LlamaDecoderLayer(idx)
            # Gemma reuses Llama's attention backends unchanged.
            decoder.self_attn_ = GEMMA_ATTENTION_CLASSES[llm_args.attn_implementation_](
                layer.self_attn.q_proj,
                layer.self_attn.k_proj,
                layer.self_attn.v_proj,
                layer.self_attn.o_proj,
                idx,
                llm_args,
            )
            decoder.mlp_ = FeedForward(
                LlamaMLP(
                    layer.mlp.gate_proj,
                    layer.mlp.down_proj,
                    layer.mlp.up_proj,
                    llm_args,
                )
            )
            # Norms must be Gemma's (1 + w) variant, not Llama's.
            decoder.input_layernorm_ = GemmaRMSNorm(
                layer.input_layernorm.weight, llm_args.rms_norm_eps_
            )
            decoder.post_attention_layernorm_ = GemmaRMSNorm(
                layer.post_attention_layernorm.weight, llm_args.rms_norm_eps_
            )
            model.layers_.append(decoder)

        return model
c2cite/models/modeling_gemma2.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers.models.gemma2 import modeling_gemma2
7
+ from transformers.models.gemma2.modeling_gemma2 import apply_rotary_pos_emb, repeat_kv
8
+ from transformers.utils import is_flash_attn_2_available
9
+
10
+ from moe_peft.common import (
11
+ FeedForward,
12
+ Linear,
13
+ LLMAttention,
14
+ LLMCache,
15
+ LLMDecoder,
16
+ LLMForCausalLM,
17
+ LLMModelConfig,
18
+ LLMModelInput,
19
+ collect_plugin_router_logtis,
20
+ flash_attention_forward,
21
+ prepare_4d_causal_attention_mask,
22
+ )
23
+ from moe_peft.executors import executor
24
+ from moe_peft.models.modeling_gemma import GemmaEmbedding, GemmaRMSNorm
25
+ from moe_peft.models.modeling_llama import LlamaMLP
26
+ from moe_peft.utils import copy_parameters, is_package_available
27
+
28
+
29
@dataclass
class Gemma2Config(LLMModelConfig):
    """Gemma-2 model configuration, extending the shared LLMModelConfig."""

    # Epsilon for RMSNorm numerical stability.
    rms_norm_eps_: float = 1e-6
    # Tanh soft-cap applied to attention logits (None disables capping).
    attn_logit_softcapping_: float = 50.0
    # Tanh soft-cap applied to the final LM-head logits (None disables capping).
    final_logit_softcapping_: float = 30.0
    # Queries are scaled by query_pre_attn_scalar**-0.5 instead of head_dim**-0.5.
    query_pre_attn_scalar_: int = 224
    # When True, even-indexed layers restrict attention to a sliding window.
    use_sliding_window_: bool = False
    # Sliding-window size in tokens (used only when use_sliding_window_ is True).
    sliding_window_: int = 4096
37
+
38
+
39
class Gemma2RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) tables for Gemma-2.

    Inverse frequencies are precomputed at construction time; each forward
    call produces the (cos, sin) tables for a batch of position ids, cast to
    the dtype of the activations.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # inv_freq[i] = 1 / base**(2i / dim) for i in [0, dim / 2)
        exponents = (
            torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device)
            / self.dim
        )
        inv_freq = 1.0 / (self.base**exponents)
        self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        self.inv_freq.to(x.device)
        batch = position_ids.shape[0]
        freq_cols = self.inv_freq[None, :, None].float().expand(batch, -1, 1)
        pos_rows = position_ids[:, None, :].float()
        # Compute angles in float32: bfloat16 loses precision on long contexts
        # (see https://github.com/huggingface/transformers/pull/29285).
        autocast_device = x.device.type
        if not isinstance(autocast_device, str) or autocast_device == "mps":
            autocast_device = "cpu"
        with torch.autocast(device_type=autocast_device, enabled=False):
            angles = (freq_cols.float() @ pos_rows.float()).transpose(1, 2)
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos()
            sin = table.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
79
+
80
+
81
# Multi-headed attention from 'Attention Is All You Need' paper.
class Gemma2Attention(LLMAttention):
    """Eager (matmul + softmax) multi-head attention for Gemma-2.

    Wraps the base model's q/k/v/o projections in the project's adapter-aware
    `Linear` so LoRA weights can be applied per request via `input_args`.
    Applies Gemma-2's query scaling (query_pre_attn_scalar**-0.5) and the
    optional tanh soft-cap on attention logits.
    """

    def __init__(
        self,
        q_proj: nn.Module,
        k_proj: nn.Module,
        v_proj: nn.Module,
        o_proj: nn.Module,
        layer_idx: int,
        config: Gemma2Config,
    ):
        super().__init__()
        # attention
        self.q_proj_: Linear = Linear(q_proj, config.device_)
        self.k_proj_: Linear = Linear(k_proj, config.device_)
        self.v_proj_: Linear = Linear(v_proj, config.device_)
        self.o_proj_: Linear = Linear(o_proj, config.device_)
        # config
        self.layer_idx_ = layer_idx
        self.config_ = config
        self.dim_ = config.dim_
        self.n_heads_ = config.n_heads_
        self.n_kv_heads_ = config.n_kv_heads_
        # GQA: each KV head is shared by n_rep_ query heads.
        self.n_rep_ = self.n_heads_ // self.n_kv_heads_
        self.head_dim_ = config.head_dim_
        self.dtype_ = config.dtype_
        self.is_causal_ = True

        # Gemma-2 scales queries by query_pre_attn_scalar**-0.5 rather than
        # the conventional head_dim**-0.5.
        self.scaling_ = config.query_pre_attn_scalar_**-0.5
        # Sliding-window attention is active only on even-indexed layers.
        self.sliding_window_ = (
            config.sliding_window_
            if config.use_sliding_window_ and not bool(layer_idx % 2)
            else None
        )

    def state_dict(self) -> Dict[str, Linear]:
        """Expose the four projection wrappers for adapter injection/saving."""
        return {
            "q_proj": self.q_proj_,
            "k_proj": self.k_proj_,
            "v_proj": self.v_proj_,
            "o_proj": self.o_proj_,
        }

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run eager attention and return the projected output tensor."""
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj_(hidden_states, input_args)
        key_states = self.k_proj_(hidden_states, input_args)
        value_states = self.v_proj_(hidden_states, input_args)

        # [bsz, seq, heads*dim] -> [bsz, heads, seq, dim]
        query_states = query_states.view(
            bsz, q_len, self.n_heads_, self.head_dim_
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        cos, sin = rotary_emb
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # Append this step's K/V to the cache; the cache may use the
            # sliding window / cache position to manage eviction.
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window_,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx_, cache_kwargs
            )

        # Expand KV heads to match the number of query heads (GQA).
        key_states = repeat_kv(key_states, self.n_rep_)
        value_states = repeat_kv(value_states, self.n_rep_)

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling_
        )

        # Soft-cap attention logits: cap * tanh(logits / cap).
        if self.config_.attn_logit_softcapping_ is not None:
            attn_weights = attn_weights / self.config_.attn_logit_softcapping_
            attn_weights = torch.tanh(attn_weights)
            attn_weights = attn_weights * self.config_.attn_logit_softcapping_

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.view(bsz, q_len, -1)
        return self.o_proj_(attn_output, input_args)
190
+
191
+
192
class Gemma2FlashAttention2(Gemma2Attention):
    """Flash-Attention-2 variant of Gemma-2 attention.

    Same projections and rotary/cache handling as `Gemma2Attention`, but the
    score computation is delegated to `flash_attention_forward`. Logit
    soft-capping is only passed through when flash-attn >= 2.6.0 is installed.
    """

    def __init__(
        self,
        q_proj: nn.Module,
        k_proj: nn.Module,
        v_proj: nn.Module,
        o_proj: nn.Module,
        layer_idx: int,
        config: Gemma2Config,
    ):
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(q_proj, k_proj, v_proj, o_proj, layer_idx, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run flash attention and return the projected output tensor."""
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj_(hidden_states, input_args)
        key_states = self.k_proj_(hidden_states, input_args)
        value_states = self.v_proj_(hidden_states, input_args)

        # [bsz, seq, heads*dim] -> [bsz, heads, seq, dim]
        query_states = query_states.view(
            bsz, q_len, self.n_heads_, self.head_dim_
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        cos, sin = rotary_emb
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window_,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx_, cache_kwargs
            )

        if attention_mask is not None:
            # Trim cached K/V to the mask's sequence length along dim 2.
            # NOTE(review): assumes attention_mask is [bsz, seq_len] here —
            # confirm against the caller's mask layout.
            seq_len = attention_mask.shape[1]
            key_states = key_states[:, :, :seq_len]
            value_states = value_states[:, :, :seq_len]

        # Flash-attn kernels expect [bsz, seq, heads, dim].
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Flash attention does not support fp32; downcast and restore after.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if executor.is_bf16_supported():
                target_dtype = torch.bfloat16
            else:
                target_dtype = torch.float16
            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            is_causal=self.is_causal_,
            softmax_scale=self.scaling_,
            sliding_window=(
                self.sliding_window_ if self.config_.use_sliding_window_ else None
            ),
            # softcap support landed in flash-attn 2.6.0; silently dropped below.
            softcap=(
                self.config_.attn_logit_softcapping_
                if is_package_available("flash_attn", "2.6.0")
                else None
            ),
        ).to(input_dtype)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj_(attn_output, input_args)

        return attn_output
287
+
288
+
289
# Maps the config's `attn_implementation_` value to the attention class used
# when assembling the model in `Gemma2ForCausalLM.from_pretrained`.
GEMMA2_ATTENTION_CLASSES = {
    "eager": Gemma2Attention,
    "flash_attn": Gemma2FlashAttention2,
}
293
+
294
+
295
class Gemma2DecoderLayer(LLMDecoder):
    """One Gemma-2 transformer block.

    Gemma-2 sandwiches both sublayers between two RMSNorms (input/post-attention
    for self-attention, pre/post-feedforward for the MLP) and alternates
    sliding-window and global attention per layer (even layers slide).
    Submodules are populated externally by `Gemma2ForCausalLM.from_pretrained`.
    """

    def __init__(self, layer_idx: int, config: Gemma2Config) -> None:
        super().__init__()
        self.layer_id_: int = layer_idx
        self.self_attn_: Gemma2Attention = None
        self.mlp_: FeedForward = None
        self.input_layernorm_: GemmaRMSNorm = None
        self.post_attention_layernorm_: GemmaRMSNorm = None

        self.config_ = config
        # Even-indexed layers (0, 2, ...) use sliding-window attention.
        self.is_sliding_ = not bool(layer_idx % 2)
        self.pre_feedforward_layernorm_: GemmaRMSNorm = None
        self.post_feedforward_layernorm_: GemmaRMSNorm = None
        self.sliding_window_ = config.sliding_window_

    def state_dict(self) -> Tuple[Dict[str, nn.Module], Dict[str, nn.Module]]:
        """Return (attention weights, MLP weights) for adapter management."""
        return self.self_attn_.state_dict(), self.mlp_.state_dict()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Apply attention and MLP sublayers; returns (hidden_states, *router_logits)."""
        if (
            self.config_.use_sliding_window_
            and self.is_sliding_
            and attention_mask is not None
        ):
            if self.config_.attn_implementation_ == "flash_attn":
                if past_key_value is not None:  # when decoding
                    # BUGFIX: was `self.sliding_window` (missing trailing
                    # underscore), which raised AttributeError — the attribute
                    # set in __init__ is `sliding_window_`.
                    attention_mask = attention_mask[:, -self.sliding_window_ :]
            else:
                min_dtype = torch.finfo(hidden_states.dtype).min
                # Mask out keys further back than the sliding window.
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool),
                    diagonal=-self.sliding_window_,
                )
                attention_mask = torch.where(
                    sliding_window_mask, min_dtype, attention_mask
                )
                if attention_mask.shape[-1] <= 1:  # when decoding
                    attention_mask = attention_mask[:, :, :, -self.sliding_window_ :]

        residual = hidden_states

        hidden_states = self.input_layernorm_(hidden_states)

        hidden_states = self.self_attn_.forward(
            hidden_states,
            input_args,
            rotary_emb,
            attention_mask,
            cache_position,
            past_key_value,
        )
        hidden_states = self.post_attention_layernorm_(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm_(hidden_states)
        hidden_states, router_logits = self.mlp_.forward(hidden_states, input_args)
        hidden_states = self.post_feedforward_layernorm_(hidden_states)
        hidden_states = residual + hidden_states

        if input_args.output_router_logits_:
            router_logits = collect_plugin_router_logtis(
                router_logits, input_args, self
            )

        return hidden_states, *router_logits
369
+
370
+
371
class Gemma2OutputLayer(nn.Module):
    """Final Gemma-2 projection: LM head followed by optional logit soft-capping.

    The soft-cap squeezes logits into (-cap, cap) via `cap * tanh(logits / cap)`;
    a cap of None disables it.
    """

    def __init__(self, config: Gemma2Config):
        super().__init__()
        self.lm_head_ = nn.Linear(
            config.dim_,
            config.vocab_size_,
            bias=False,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.final_logit_softcapping_ = config.final_logit_softcapping_

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        logits = self.lm_head_(hidden_states)
        cap = self.final_logit_softcapping_
        if cap is None:
            return logits
        return cap * torch.tanh(logits / cap)
390
+
391
+
392
class Gemma2ForCausalLM(LLMForCausalLM):
    """Gemma-2 causal LM assembled from a HuggingFace checkpoint.

    Holds the embedding, decoder stack, final norm, rotary embedding and
    soft-capped LM head; the actual weights are wired in by `from_pretrained`,
    which wraps the HF modules in this project's adapter-aware layers.
    """

    def __init__(self, config: Gemma2Config) -> None:
        super().__init__()
        self.config_ = config
        self.padding_idx_ = config.pad_token_id_
        self.vocab_size_ = config.vocab_size_
        # Populated later by from_pretrained.
        self.embed_tokens_: GemmaEmbedding = None
        self.norm_: GemmaRMSNorm = None
        self.rotary_emb_ = Gemma2RotaryEmbedding(
            config.head_dim_,
            max_position_embeddings=config.max_seq_len_,
            base=config.rope_theta_,
            device=config.device_,
        )
        self.lm_head_ = Gemma2OutputLayer(config)
        self.layers_: List[Gemma2DecoderLayer] = []

    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Token ids -> scaled embeddings (see GemmaEmbedding)."""
        return self.embed_tokens_(input_ids)

    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (cos, sin) RoPE tables for the given positions."""
        return self.rotary_emb_(input_tensor, position_ids)

    def decoder_stack(self) -> List[LLMDecoder]:
        """Return the ordered list of decoder layers."""
        return self.layers_

    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the final RMSNorm before the LM head."""
        return self.norm_(hidden_states)

    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:
        """Build the 4D additive causal mask for the current step."""

        return prepare_4d_causal_attention_mask(
            attention_mask,
            input_tensor,
            cache_position,
            past_key_values,
        )

    def cache_implementation(self) -> str:
        """Select the KV-cache type: hybrid when sliding windows are active."""
        if self.config_.use_sliding_window_ and self.config_.sliding_window_:
            return "hybrid"
        else:
            return "dynamic"

    def model_config(self) -> Gemma2Config:
        return self.config_

    @staticmethod
    def from_pretrained(
        llm_model: modeling_gemma2.Gemma2PreTrainedModel,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Wrap a loaded HF Gemma-2 model into a Gemma2ForCausalLM.

        The HF weights are frozen and shared (not copied) except for the LM
        head, which is copied into the soft-capping output layer.
        """
        llm_config: modeling_gemma2.Gemma2Config = llm_model.config
        # Translate the HF config into this project's field-per-field config.
        model_config = Gemma2Config(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.head_dim,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            hidden_act_=llm_config.hidden_activation,
            rms_norm_eps_=llm_config.rms_norm_eps,
            max_seq_len_=llm_config.max_position_embeddings,
            rope_theta_=llm_config.rope_theta,
            attn_logit_softcapping_=llm_config.attn_logit_softcapping,
            final_logit_softcapping_=llm_config.final_logit_softcapping,
            query_pre_attn_scalar_=llm_config.query_pre_attn_scalar,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            use_sliding_window_=use_sliding_window,
            sliding_window_=llm_config.sliding_window,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # Sentinel padding id when the checkpoint defines none.
        if model_config.pad_token_id_ is None:
            model_config.pad_token_id_ = -1

        model = Gemma2ForCausalLM(model_config)
        llm_model.requires_grad_(False)
        model.embed_tokens_ = GemmaEmbedding(
            llm_model.model.embed_tokens.weight,
            model_config.pad_token_id_,
            model_config.dim_**0.5,
        )
        model.norm_ = GemmaRMSNorm(
            llm_model.model.norm.weight, model_config.rms_norm_eps_
        )
        copy_parameters(llm_model.lm_head, model.lm_head_.lm_head_)

        # Rebuild each decoder layer around the HF layer's modules.
        for layer_idx, layer in enumerate(llm_model.model.layers):
            decoder = Gemma2DecoderLayer(layer_idx, model_config)
            decoder.self_attn_ = GEMMA2_ATTENTION_CLASSES[
                model_config.attn_implementation_
            ](
                layer.self_attn.q_proj,
                layer.self_attn.k_proj,
                layer.self_attn.v_proj,
                layer.self_attn.o_proj,
                layer_idx,
                model_config,
            )
            decoder.mlp_ = FeedForward(
                LlamaMLP(
                    layer.mlp.gate_proj,
                    layer.mlp.down_proj,
                    layer.mlp.up_proj,
                    model_config,
                )
            )
            decoder.input_layernorm_ = GemmaRMSNorm(
                layer.input_layernorm.weight, model_config.rms_norm_eps_
            )
            decoder.post_attention_layernorm_ = GemmaRMSNorm(
                layer.post_attention_layernorm.weight, model_config.rms_norm_eps_
            )
            decoder.pre_feedforward_layernorm_ = GemmaRMSNorm(
                layer.pre_feedforward_layernorm.weight, model_config.rms_norm_eps_
            )
            decoder.post_feedforward_layernorm_ = GemmaRMSNorm(
                layer.post_feedforward_layernorm.weight, model_config.rms_norm_eps_
            )
            model.layers_.append(decoder)

        return model
c2cite/models/modeling_llama.py ADDED
@@ -0,0 +1,579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from transformers.activations import ACT2FN
8
+ from transformers.models.llama import modeling_llama
9
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
10
+ from transformers.utils import is_flash_attn_2_available
11
+
12
+ from moe_peft.common import (
13
+ ROPE_INIT_FUNCTIONS,
14
+ FeedForward,
15
+ Linear,
16
+ LLMAttention,
17
+ LLMCache,
18
+ LLMDecoder,
19
+ LLMFeedForward,
20
+ LLMForCausalLM,
21
+ LLMModelConfig,
22
+ LLMModelInput,
23
+ collect_plugin_router_logtis,
24
+ eager_attention_forward,
25
+ flash_attention_forward,
26
+ prepare_4d_causal_attention_mask,
27
+ slice_tensor,
28
+ )
29
+ from moe_peft.executors import executor
30
+ from moe_peft.utils import copy_parameters
31
+
32
+
33
@dataclass
class LlamaConfig(LLMModelConfig):
    """LLaMA-family model configuration on top of the shared LLMModelConfig."""

    # Epsilon used by the RMSNorm layers.
    rms_norm_eps_: float = 1e-6
    # Optional HF-style rope scaling dict; LlamaRotaryEmbedding reads
    # "rope_type" (or the legacy "type") from it.
    rope_scaling_: Optional[Dict[str, Any]] = None
37
+
38
+
39
class LlamaRotaryEmbedding(nn.Module):
    """Rotary position embedding for LLaMA with optional rope scaling.

    Supports the rope variants registered in ROPE_INIT_FUNCTIONS. For
    "dynamic" variants, the inverse-frequency buffer is re-derived whenever
    positions grow past the cached length and restored once they shrink back.
    """

    def __init__(
        self,
        config: Optional[LlamaConfig],
        scaling_factor=1.0,
        rope_type="default",
    ):
        super().__init__()
        if config is None:
            # BUGFIX: the previous `config is None` branch (and the
            # rope_kwargs built before it) dereferenced `config` anyway and
            # crashed with AttributeError. Fail fast with a clear error; the
            # Optional annotation is kept for interface compatibility.
            raise ValueError("LlamaRotaryEmbedding requires a LlamaConfig instance")
        self.rope_kwargs = {
            "rope_type": rope_type,
            "factor": scaling_factor,
            "dim": config.head_dim_,
            "base": config.rope_theta_,
            "max_position_embeddings": config.max_seq_len_,
        }
        # BC: "rope_type" was originally "type"
        if config.rope_scaling_ is not None:
            self.rope_type = config.rope_scaling_.get(
                "rope_type", config.rope_scaling_.get("type")
            )
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_seq_len_
        self.original_max_seq_len = config.max_seq_len_

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, config.device_, **self.rope_kwargs
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Kept so the dynamic path can restore the original table on reset.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """Grow or reset `inv_freq` for dynamic rope variants."""
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer(
                "inv_freq", inv_freq, persistent=False
            )  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if (
            seq_len < self.original_max_seq_len
            and self.max_seq_len_cached > self.original_max_seq_len
        ):  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return (cos, sin) tables in x.dtype for the given position ids."""
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = (
            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        )
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = (
            device_type
            if isinstance(device_type, str) and device_type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor,
        # equivalent to scaling attention.
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
126
+
127
+
128
# Multi-headed attention from 'Attention Is All You Need' paper.
class LlamaAttention(LLMAttention):
    """Eager multi-head attention for LLaMA with adapter-aware projections.

    NOTE(review): unlike `LlamaFlashAttention`, this forward returns a tuple
    `(output, attention_matrix)` — the attention matrix comes from
    `eager_attention_forward`. Callers must handle both return shapes;
    confirm this asymmetry is intentional.
    """

    def __init__(
        self,
        wq: nn.Module,
        wk: nn.Module,
        wv: nn.Module,
        wo: nn.Module,
        idx: int,
        args: LlamaConfig,
    ):
        super().__init__()
        # attention
        self.wq_: Linear = Linear(wq, args.device_)  # dim * dim
        self.wk_: Linear = Linear(wk, args.device_)  # dim * dim
        self.wv_: Linear = Linear(wv, args.device_)  # dim * dim
        self.wo_: Linear = Linear(wo, args.device_)  # dim * dim
        # config
        self.layer_idx_ = idx
        self.dim_ = args.dim_
        self.n_heads_ = args.n_heads_
        self.n_kv_heads_ = args.n_kv_heads_
        # GQA: each KV head serves n_rep_ query heads.
        self.n_rep_ = self.n_heads_ // self.n_kv_heads_
        self.head_dim_ = args.head_dim_
        self.dtype_ = args.dtype_
        self.is_causal_ = True

    def state_dict(self) -> Dict[str, Linear]:
        """Expose projections under HF-style names for adapter injection."""
        return {
            "q_proj": self.wq_,
            "k_proj": self.wk_,
            "v_proj": self.wv_,
            "o_proj": self.wo_,
        }

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run eager attention; returns (projected output, attention matrix)."""
        batch_size, max_seq_len, _ = hidden_states.shape

        xq = self.wq_.forward(hidden_states, input_args)
        xk = self.wk_.forward(hidden_states, input_args)
        xv = self.wv_.forward(hidden_states, input_args)

        # conver shape to multi head
        xq = xq.view(batch_size, max_seq_len, self.n_heads_, self.head_dim_).transpose(
            1, 2
        )
        xk = xk.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        xv = xv.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        # apply rotary embedding
        cos, sin = rotary_emb
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }
            xk, xv = past_key_value.update(xk, xv, self.layer_idx_, cache_kwargs)

        # for llama2 need to repeat the heads
        # before dim: batch_size, n_kv_head, seq_len, head_dim
        # after dim: batch_size, n_head, seq_len, head_dim
        xk = repeat_kv(xk, self.n_rep_)
        xv = repeat_kv(xv, self.n_rep_)

        attention_score, attention_matrix = eager_attention_forward(xq, xk, xv, attention_mask)
        attention_score = attention_score.reshape(batch_size, max_seq_len, -1)

        # get output attention score
        return self.wo_.forward(attention_score, input_args), attention_matrix
212
+
213
+
214
class LlamaFlashAttention(LlamaAttention):
    """Flash-Attention-2 variant of the LLaMA attention layer.

    Projections, rotary application and KV caching are identical to the eager
    base class; the score computation is delegated to
    `flash_attention_forward`. Returns only the projected output tensor.
    """

    def __init__(
        self,
        wq: nn.Module,
        wk: nn.Module,
        wv: nn.Module,
        wo: nn.Module,
        idx: int,
        args: LlamaConfig,
    ):
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(wq, wk, wv, wo, idx, args)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        bsz, seq_len, _ = hidden_states.shape

        q = self.wq_.forward(hidden_states, input_args)
        k = self.wk_.forward(hidden_states, input_args)
        v = self.wv_.forward(hidden_states, input_args)

        # [bsz, seq, heads*dim] -> [bsz, heads, seq, dim]
        q = q.view(bsz, seq_len, self.n_heads_, self.head_dim_).transpose(1, 2)
        k = k.view(bsz, seq_len, self.n_kv_heads_, self.head_dim_).transpose(1, 2)
        v = v.view(bsz, seq_len, self.n_kv_heads_, self.head_dim_).transpose(1, 2)

        # Rotate queries/keys, then merge this step's K/V into the cache.
        cos, sin = rotary_emb
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        if past_key_value is not None:
            k, v = past_key_value.update(
                k,
                v,
                self.layer_idx_,
                {"sin": sin, "cos": cos, "cache_position": cache_position},
            )

        # Flash-attn kernels expect [bsz, seq, heads, dim].
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Flash attention has no fp32 path; downcast and restore afterwards.
        orig_dtype = q.dtype
        if orig_dtype == torch.float32:
            compute_dtype = (
                torch.bfloat16 if executor.is_bf16_supported() else torch.float16
            )
            q = q.to(compute_dtype)
            k = k.to(compute_dtype)
            v = v.to(compute_dtype)

        out = flash_attention_forward(
            q,
            k,
            v,
            attention_mask,
            seq_len,
            is_causal=self.is_causal_,
        ).to(orig_dtype)

        out = out.reshape(bsz, seq_len, -1).contiguous()
        return self.wo_.forward(out, input_args)
292
+
293
+
294
# Maps the config's `attn_implementation_` value to the attention class used
# when assembling the LLaMA model.
LLAMA_ATTENTION_CLASSES = {
    "eager": LlamaAttention,
    "flash_attn": LlamaFlashAttention,
}
298
+
299
+
300
class LlamaMLP(LLMFeedForward):
    """LLaMA gated feed-forward (SwiGLU-style) with LoRA / MixLoRA hooks.

    w1 = gate_proj, w2 = down_proj, w3 = up_proj; the base computation is
    w2(act(w1(x)) * w3(x)). The `_lora_forward` / `_mixlora_forward` variants
    apply per-adapter LoRA deltas around each projection.
    """

    def __init__(
        self, w1: nn.Module, w2: nn.Module, w3: nn.Module, args: LlamaConfig
    ) -> None:
        super().__init__()
        # feed forward
        self.w1_: Linear = Linear(w1, args.device_)
        self.w2_: Linear = Linear(w2, args.device_)
        self.w3_: Linear = Linear(w3, args.device_)
        self.act_ = ACT2FN[args.hidden_act_]

    def state_dict(self) -> Dict[str, nn.Module]:
        """Expose projections under HF-style names for adapter injection."""
        return {
            "gate_proj": self.w1_,
            "down_proj": self.w2_,
            "up_proj": self.w3_,
        }

    def _batch_forward(
        self, data: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Standard gated-MLP pass; each Linear applies its own adapters."""
        w1 = self.w1_.forward(data, input_args)
        w3 = self.w3_.forward(data, input_args)
        return self.w2_.forward(self.act_(w1) * w3, input_args)

    def _lora_forward(
        self, lora_name: str, act_fn: nn.Module, data: torch.Tensor
    ) -> torch.Tensor:
        # Applying LoRA weights to FFN weights
        # Each projection falls back to its frozen base layer when the named
        # adapter is absent.
        if lora_name in self.w1_.loras_:
            w1 = self.w1_.loras_[lora_name].forward(
                self.w1_.base_layer_.forward(data), data
            )
        else:
            w1 = self.w1_.base_layer_.forward(data)

        if lora_name in self.w3_.loras_:
            w3 = self.w3_.loras_[lora_name].forward(
                self.w3_.base_layer_.forward(data), data
            )
        else:
            w3 = self.w3_.base_layer_.forward(data)

        act_result = act_fn(w1) * w3
        if lora_name in self.w2_.loras_:
            return self.w2_.loras_[lora_name].forward(
                self.w2_.base_layer_.forward(act_result), act_result
            )
        else:
            return self.w2_.base_layer_.forward(act_result)

    def _mixlora_forward(
        self, moe_name, act_fn, expert_mask, hidden_states, input_dtype
    ):
        """Per-expert forward for MixLoRA: returns one output tensor per expert.

        The frozen base projections (w1/w3) are computed once over all tokens;
        each expert then applies its LoRA delta only to the token rows routed
        to it (selected via `expert_mask`).
        """
        common_w1 = self.w1_.base_layer_.forward(hidden_states.to(input_dtype)).to(
            hidden_states.dtype
        )
        common_w3 = self.w3_.base_layer_.forward(hidden_states.to(input_dtype)).to(
            hidden_states.dtype
        )
        final_expert_states = []
        for expert_idx in range(expert_mask.shape[0]):
            # Indices of tokens routed to this expert.
            _, top_x = torch.where(expert_mask[expert_idx])

            lora_name = f"moe.{moe_name}.experts.{expert_idx}"
            if lora_name in self.w1_.loras_:
                lora_data = slice_tensor(hidden_states, top_x, input_dtype)
                w1 = self.w1_.loras_[lora_name].forward(
                    slice_tensor(common_w1, top_x, input_dtype), lora_data
                )
            else:
                lora_data = None
                w1 = slice_tensor(common_w1, top_x, input_dtype)

            if lora_name in self.w3_.loras_:
                w3 = self.w3_.loras_[lora_name].forward(
                    slice_tensor(common_w3, top_x, input_dtype),
                    slice_tensor(hidden_states, top_x, input_dtype, lora_data),
                )
            else:
                w3 = slice_tensor(common_w3, top_x, input_dtype)

            act_result = act_fn(w1) * w3
            if lora_name in self.w2_.loras_:
                final_expert_states.append(
                    self.w2_.loras_[lora_name].forward(
                        self.w2_.base_layer_.forward(act_result), act_result
                    )
                )
            else:
                final_expert_states.append(self.w2_.base_layer_.forward(act_result))

        return final_expert_states
393
+
394
+
395
class LlamaRMSNorm(nn.Module):
    """Root-mean-square layer norm (no mean subtraction), as used by Llama."""

    def __init__(self, weight: torch.Tensor, eps: float = 1e-6):
        super().__init__()
        self.norm_eps_ = eps
        self.weight_ = weight

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        original_dtype = data.dtype
        # Accumulate the mean-square in fp32 for numerical stability.
        mean_square = data.to(torch.float32).square().mean(dim=-1, keepdim=True)
        normalized = data * torch.rsqrt(mean_square + self.norm_eps_)
        # Scale by the (frozen) weight, then restore the input dtype.
        return (self.weight_ * normalized).to(original_dtype)
407
+
408
+
409
class LlamaDecoderLayer(LLMDecoder):
    """One Llama transformer block: pre-norm self-attention and pre-norm MLP,
    each followed by a residual add."""

    def __init__(self, layer_id: int) -> None:
        super().__init__()
        self.layer_id_: int = layer_id
        # Sub-modules are populated afterwards by LlamaForCausalLM.from_pretrained.
        self.self_attn_: LlamaAttention = None
        self.mlp_: FeedForward = None
        self.input_layernorm_: LlamaRMSNorm = None
        self.post_attention_layernorm_: LlamaRMSNorm = None

    def state_dict(self) -> Tuple[Dict[str, nn.Module], Dict[str, nn.Module]]:
        """Return (attention projections, MLP projections) keyed by HF names."""
        return self.self_attn_.state_dict(), self.mlp_.state_dict()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run one decoder block; returns (hidden_states, attention_matrix)."""

        residual = hidden_states
        hidden_states = self.input_layernorm_(hidden_states)
        # Self Attention (expects a 2-tuple back from the attention module).
        hidden_states, attention_matrix = self.self_attn_.forward(
            hidden_states,
            input_args,
            rotary_emb,
            attention_mask,
            cache_position,
            past_key_value,
        )
        hidden_states = residual + hidden_states
        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm_(hidden_states)
        # NOTE(review): PhiDecoderLayer unpacks (output, router_logits) from
        # FeedForward.forward, while here the raw return value is used as a
        # tensor — confirm FeedForward's return arity for this code path.
        hidden_states = self.mlp_.forward(hidden_states, input_args)
        hidden_states = residual + hidden_states

        return hidden_states, attention_matrix
450
+
451
+
452
class LlamaEmbedding(nn.Module):
    """Token-embedding lookup over a shared (frozen) weight tensor."""

    def __init__(self, embedding: torch.Tensor, pad_token: int):
        super().__init__()
        self.token_embedding_: torch.Tensor = embedding
        self.padding_idx_: int = pad_token

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # Functional lookup so the weight tensor can be shared with the HF model.
        return F.embedding(
            tokens, self.token_embedding_, padding_idx=self.padding_idx_
        )
461
+
462
+
463
class LlamaForCausalLM(LLMForCausalLM):
    """Llama causal-LM rebuilt around LoRA/MoE-capable projection wrappers,
    constructed from a frozen HF model via ``from_pretrained``."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.config_ = config
        self.padding_idx_ = config.pad_token_id_
        self.vocab_size_ = config.vocab_size_
        # Populated by from_pretrained below.
        self.embed_tokens_: LlamaEmbedding = None
        self.norm_: LlamaRMSNorm = None
        self.rotary_emb_ = LlamaRotaryEmbedding(config)
        self.lm_head_ = nn.Linear(
            config.dim_,
            config.vocab_size_,
            bias=False,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.layers_: List[LlamaDecoderLayer] = []

    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Map token ids to embeddings."""
        return self.embed_tokens_(input_ids)

    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (cos, sin) rotary tables for the given positions."""
        return self.rotary_emb_(input_tensor, position_ids)

    def decoder_stack(self) -> List[LLMDecoder]:
        return self.layers_

    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the final RMS norm before the LM head."""
        return self.norm_(hidden_states)

    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:
        """Build the 4-D causal attention mask for the decoder stack."""

        return prepare_4d_causal_attention_mask(
            attention_mask,
            input_tensor,
            cache_position,
            past_key_values,
        )

    def model_config(self) -> LlamaConfig:
        return self.config_

    @staticmethod
    def from_pretrained(
        llm_model: modeling_llama.LlamaForCausalLM,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Freeze a HF Llama model and rebuild it with LoRA-capable layers.

        Weight tensors for embeddings/norms are shared with the HF model;
        lm_head weights are copied.
        """
        assert not use_sliding_window, "Llama model does not support SWA."
        llm_config: modeling_llama.LlamaConfig = llm_model.config
        llm_args = LlamaConfig(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.hidden_size // llm_config.num_attention_heads,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            hidden_act_=llm_config.hidden_act,
            rms_norm_eps_=llm_config.rms_norm_eps,
            max_seq_len_=llm_config.max_position_embeddings,
            rope_theta_=llm_config.rope_theta,
            rope_scaling_=llm_config.rope_scaling,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # -1 marks "no pad token" for downstream code.
        if llm_args.pad_token_id_ is None:
            llm_args.pad_token_id_ = -1

        model = LlamaForCausalLM(llm_args)
        llm_model.requires_grad_(False)
        model.embed_tokens_ = LlamaEmbedding(
            llm_model.model.embed_tokens.weight, llm_args.pad_token_id_
        )
        model.norm_ = LlamaRMSNorm(llm_model.model.norm.weight, llm_args.rms_norm_eps_)
        copy_parameters(llm_model.lm_head, model.lm_head_)

        # Wrap every HF decoder layer with the LoRA-capable equivalents.
        for idx, layer in enumerate(llm_model.model.layers):
            decoder = LlamaDecoderLayer(idx)
            decoder.self_attn_ = LLAMA_ATTENTION_CLASSES[llm_args.attn_implementation_](
                layer.self_attn.q_proj,
                layer.self_attn.k_proj,
                layer.self_attn.v_proj,
                layer.self_attn.o_proj,
                idx,
                llm_args,
            )
            decoder.mlp_ = FeedForward(
                LlamaMLP(
                    layer.mlp.gate_proj,
                    layer.mlp.down_proj,
                    layer.mlp.up_proj,
                    llm_args,
                )
            )
            decoder.input_layernorm_ = LlamaRMSNorm(
                layer.input_layernorm.weight, llm_args.rms_norm_eps_
            )
            decoder.post_attention_layernorm_ = LlamaRMSNorm(
                layer.post_attention_layernorm.weight, llm_args.rms_norm_eps_
            )
            model.layers_.append(decoder)

        return model
c2cite/models/modeling_mistral.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers.models.mistral import modeling_mistral
7
+ from transformers.models.qwen2 import modeling_qwen2
8
+ from transformers.utils import is_flash_attn_2_available
9
+
10
+ from moe_peft.common import (
11
+ FeedForward,
12
+ LLMCache,
13
+ LLMModelInput,
14
+ flash_attention_forward,
15
+ )
16
+ from moe_peft.executors import executor
17
+ from moe_peft.models.modeling_llama import (
18
+ LlamaAttention,
19
+ LlamaConfig,
20
+ LlamaDecoderLayer,
21
+ LlamaEmbedding,
22
+ LlamaForCausalLM,
23
+ LlamaMLP,
24
+ LlamaRMSNorm,
25
+ apply_rotary_pos_emb,
26
+ repeat_kv,
27
+ )
28
+ from moe_peft.utils import copy_parameters
29
+
30
+
31
@dataclass
class MistralConfig(LlamaConfig):
    """Llama config extended with sliding-window attention fields
    (shared between Mistral and Qwen2)."""

    # Qwen2-only switch; Mistral leaves it False.
    use_sliding_window_: bool = False
    # Qwen2-only: layers at/above this index use SWA (None = all layers).
    max_window_layers_: Optional[int] = None
    # Window size for sliding-window attention (None = disabled).
    sliding_window_: Optional[int] = None
36
+
37
+
38
class MistralFlashAttention(LlamaAttention):
    """Flash-attention-2 variant of Llama attention with optional
    sliding-window support (Mistral / Qwen2)."""

    def __init__(
        self,
        wq: nn.Module,
        wk: nn.Module,
        wv: nn.Module,
        wo: nn.Module,
        idx: int,
        args: MistralConfig,
    ):
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(wq, wk, wv, wo, idx, args)
        # Qwen2
        self.use_sliding_window_ = args.use_sliding_window_
        self.max_window_layers_ = args.max_window_layers_
        # Mistral and Qwen2
        self.sliding_window_ = args.sliding_window_

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        # NOTE(review): this returns a single tensor, while LlamaDecoderLayer
        # unpacks two values from self_attn_.forward — confirm the base
        # LlamaAttention's return convention for the flash path.
        batch_size, max_seq_len, _ = hidden_states.shape

        xq = self.wq_.forward(hidden_states, input_args)
        xk = self.wk_.forward(hidden_states, input_args)
        xv = self.wv_.forward(hidden_states, input_args)

        # convert shape to multi head: (batch, heads, seq, head_dim)
        xq = xq.view(batch_size, max_seq_len, self.n_heads_, self.head_dim_).transpose(
            1, 2
        )
        xk = xk.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        xv = xv.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        kv_seq_len = xk.shape[-2]
        if past_key_value is not None:
            kv_seq_len += cache_position[0]

        # apply rotary embedding
        cos, sin = rotary_emb
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx_) > 0
            if (
                self.sliding_window_ is not None
                and kv_seq_len > self.sliding_window_
                and cache_has_contents
            ):
                # Keep only the last (window - 1) cached tokens.
                slicing_tokens = 1 - self.sliding_window_

                past_key = past_key_value[self.layer_idx_][0]
                past_value = past_key_value[self.layer_idx_][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.sliding_window_ - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.sliding_window - 1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                # Trim the mask to match the sliced cache, keeping the new token.
                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat(
                        [attention_mask, torch.ones_like(attention_mask[:, -1:])],
                        dim=-1,
                    )

            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }  # Specific to RoPE models
            xk, xv = past_key_value.update(xk, xv, self.layer_idx_, cache_kwargs)

        # Expand KV heads to match the query-head count (GQA).
        xk = repeat_kv(xk, self.n_rep_)
        xv = repeat_kv(xv, self.n_rep_)

        # Flash attention does not support fp32; downcast if needed.
        input_dtype = xq.dtype
        if input_dtype == torch.float32:
            if executor.is_bf16_supported():
                target_dtype = torch.bfloat16
            else:
                target_dtype = torch.float16
            xq = xq.to(target_dtype)
            xk = xk.to(target_dtype)
            xv = xv.to(target_dtype)

        # Flash attention expects (batch, seq, heads, head_dim).
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # Qwen2 gates SWA per layer via max_window_layers_; Mistral (fields
        # left at None) applies the window whenever sliding_window_ is set.
        if (
            (self.use_sliding_window_ is None or self.use_sliding_window_)
            and self.sliding_window_ is not None
            and (
                self.max_window_layers_ is None
                or self.layer_idx_ >= self.max_window_layers_
            )
        ):
            sliding_window = self.sliding_window_
        else:
            sliding_window = None

        attn_output = flash_attention_forward(
            xq,
            xk,
            xv,
            attention_mask,
            max_seq_len,
            is_causal=self.is_causal_,
            sliding_window=sliding_window,
        ).to(input_dtype)

        attn_output = attn_output.reshape(
            batch_size, max_seq_len, self.dim_
        ).contiguous()
        attn_output = self.wo_.forward(attn_output, input_args)

        return attn_output
171
+
172
+
173
# Dispatch table from attn_implementation_ to the attention class.
MISTRAL_ATTENTION_CLASSES = {
    "eager": LlamaAttention,
    "flash_attn": MistralFlashAttention,
}
177
+
178
+
179
class MistralForCausalLM(LlamaForCausalLM):
    """Mistral (and Qwen2-compatible) causal LM; reuses the Llama decoder
    structure with sliding-window-aware attention."""

    def __init__(self, config: MistralConfig) -> None:
        super().__init__(config)

    @staticmethod
    def from_pretrained(
        llm_model: modeling_mistral.MistralForCausalLM,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Freeze a HF Mistral/Qwen2 model and rebuild it with LoRA-capable layers."""
        llm_config: modeling_mistral.MistralConfig = llm_model.config
        llm_args = MistralConfig(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.hidden_size // llm_config.num_attention_heads,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            hidden_act_=llm_config.hidden_act,
            rms_norm_eps_=llm_config.rms_norm_eps,
            max_seq_len_=llm_config.max_position_embeddings,
            rope_theta_=llm_config.rope_theta,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            use_sliding_window_=use_sliding_window,
            sliding_window_=llm_config.sliding_window,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # compatible with qwen2
        if isinstance(llm_config, modeling_qwen2.Qwen2Config):
            llm_args.max_window_layers_ = llm_config.max_window_layers

        # -1 marks "no pad token" for downstream code.
        if llm_args.pad_token_id_ is None:
            llm_args.pad_token_id_ = -1

        model = MistralForCausalLM(llm_args)
        llm_model.requires_grad_(False)
        model.embed_tokens_ = LlamaEmbedding(
            llm_model.model.embed_tokens.weight, llm_args.pad_token_id_
        )
        model.norm_ = LlamaRMSNorm(llm_model.model.norm.weight, llm_args.rms_norm_eps_)
        copy_parameters(llm_model.lm_head, model.lm_head_)

        # Wrap every HF decoder layer with the LoRA-capable equivalents.
        for idx, layer in enumerate(llm_model.model.layers):
            decoder = LlamaDecoderLayer(idx)
            decoder.self_attn_ = MISTRAL_ATTENTION_CLASSES[
                llm_args.attn_implementation_
            ](
                layer.self_attn.q_proj,
                layer.self_attn.k_proj,
                layer.self_attn.v_proj,
                layer.self_attn.o_proj,
                idx,
                llm_args,
            )
            decoder.mlp_ = FeedForward(
                LlamaMLP(
                    layer.mlp.gate_proj,
                    layer.mlp.down_proj,
                    layer.mlp.up_proj,
                    llm_args,
                )
            )
            decoder.input_layernorm_ = LlamaRMSNorm(
                layer.input_layernorm.weight, llm_args.rms_norm_eps_
            )
            decoder.post_attention_layernorm_ = LlamaRMSNorm(
                layer.post_attention_layernorm.weight, llm_args.rms_norm_eps_
            )
            model.layers_.append(decoder)

        return model
c2cite/models/modeling_phi.py ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from transformers.activations import ACT2FN
8
+ from transformers.models.phi import modeling_phi
9
+ from transformers.models.phi.modeling_phi import (
10
+ PhiRotaryEmbedding,
11
+ apply_rotary_pos_emb,
12
+ repeat_kv,
13
+ )
14
+ from transformers.utils import is_flash_attn_2_available
15
+
16
+ from moe_peft.common import (
17
+ FeedForward,
18
+ Linear,
19
+ LLMAttention,
20
+ LLMCache,
21
+ LLMDecoder,
22
+ LLMFeedForward,
23
+ LLMForCausalLM,
24
+ LLMModelConfig,
25
+ LLMModelInput,
26
+ collect_plugin_router_logtis,
27
+ eager_attention_forward,
28
+ flash_attention_forward,
29
+ prepare_4d_causal_attention_mask,
30
+ slice_tensor,
31
+ )
32
+ from moe_peft.executors import executor
33
+ from moe_peft.utils import copy_parameters
34
+
35
+
36
@dataclass
class PhiConfig(LLMModelConfig):
    """Phi-1/1.5/2 model configuration on top of the shared LLMModelConfig."""

    # Epsilon for every nn.LayerNorm in the model.
    layer_norm_eps_: float = 1e-05
    # Dropout applied to attention/MLP outputs inside each decoder layer.
    resid_pdrop_: float = 0.0
    # Dropout applied right after the token embedding.
    embd_pdrop_: float = 0.0
    # Rotary channels per head; derived from partial_rotary_factor in from_pretrained.
    rotary_emb_dim_: int = 0
    # Whether q/k get an extra per-head LayerNorm.
    qk_layernorm_: bool = False
43
+
44
+
45
def apply_partial_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    rotary_emb_dim: int,
    cos: torch.Tensor,
    sin: torch.Tensor,
    position_ids: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate only the first ``rotary_emb_dim`` channels of q/k; the
    remaining channels pass through unchanged (Phi partial RoPE)."""
    q_rot = xq[..., :rotary_emb_dim]
    q_pass = xq[..., rotary_emb_dim:]
    k_rot = xk[..., :rotary_emb_dim]
    k_pass = xk[..., rotary_emb_dim:]

    # [batch_size, seq_length, num_heads, head_dim // partial_rotary_factor]
    q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin, position_ids)

    # [batch_size, seq_length, num_heads, head_dim]
    rotated_q = torch.cat((q_rot, q_pass), dim=-1)
    rotated_k = torch.cat((k_rot, k_pass), dim=-1)
    return rotated_q, rotated_k
69
+
70
+
71
+ # Multi-headed attention from 'Attention Is All You Need' paper.
72
class PhiAttention(LLMAttention):
    """Multi-headed attention from 'Attention Is All You Need' (eager Phi variant).

    Wraps the HF projections in LoRA-capable Linear layers and supports
    Phi's partial rotary embedding and optional q/k layer norm.
    """

    def __init__(
        self,
        q_proj: nn.Module,
        k_proj: nn.Module,
        v_proj: nn.Module,
        dense: nn.Module,
        idx: int,
        config: PhiConfig,
    ):
        super().__init__()
        # attention projections (LoRA-capable wrappers)
        self.wq_: Linear = Linear(q_proj, config.device_)
        self.wk_: Linear = Linear(k_proj, config.device_)
        self.wv_: Linear = Linear(v_proj, config.device_)
        self.dense_: Linear = Linear(dense, config.device_)
        # config
        self.layer_idx_ = idx
        self.dim_ = config.dim_
        self.n_heads_ = config.n_heads_
        self.n_kv_heads_ = config.n_kv_heads_
        self.n_rep_ = self.n_heads_ // self.n_kv_heads_
        self.rotary_emb_dim_ = config.rotary_emb_dim_
        self.head_dim_ = config.head_dim_
        self.dtype_ = config.dtype_
        self.is_causal_ = True
        # qk norm
        self.qk_layernorm_: bool = config.qk_layernorm_
        if self.qk_layernorm_:
            # BUGFIX: the original referenced undefined attributes
            # (self.hidden_size_, self.num_heads_) and a nonexistent
            # config.norm_eps_, raising AttributeError whenever
            # qk_layernorm_ is True. Use head_dim_ (== hidden_size //
            # num_attention_heads, see from_pretrained) and the config's
            # layer-norm epsilon, matching HF's PhiAttention.
            self.q_layernorm_ = nn.LayerNorm(
                self.head_dim_,
                eps=config.layer_norm_eps_,
                elementwise_affine=True,
            )
            self.k_layernorm_ = nn.LayerNorm(
                self.head_dim_,
                eps=config.layer_norm_eps_,
                elementwise_affine=True,
            )
        else:
            self.q_layernorm_ = nn.Identity()
            self.k_layernorm_ = nn.Identity()

    def state_dict(self) -> Dict[str, Linear]:
        """Expose the wrapped projections keyed by their HF names."""
        return {
            "q_proj": self.wq_,
            "k_proj": self.wk_,
            "v_proj": self.wv_,
            "dense": self.dense_,
        }

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Eager attention forward; returns the projected attention output."""
        batch_size, max_seq_len, _ = hidden_states.shape

        xq = self.wq_.forward(hidden_states, input_args)
        xk = self.wk_.forward(hidden_states, input_args)
        xv = self.wv_.forward(hidden_states, input_args)

        xq = self.q_layernorm_(xq)
        xk = self.k_layernorm_(xk)

        # convert shape to multi head: (batch, heads, seq, head_dim)
        xq = xq.view(batch_size, max_seq_len, self.n_heads_, self.head_dim_).transpose(
            1, 2
        )
        xk = xk.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        xv = xv.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        cos, sin = rotary_emb

        # partial rotary embedding
        xq, xk = apply_partial_rotary_emb(
            xq,
            xk,
            self.rotary_emb_dim_,
            cos,
            sin,
            cache_position.unsqueeze(0),
        )

        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_emb_dim_,
                "cache_position": cache_position,
            }
            xk, xv = past_key_value.update(xk, xv, self.layer_idx_, cache_kwargs)

        # before dim: batch_size, n_kv_head, seq_len, head_dim
        # after dim: batch_size, n_head, seq_len, head_dim
        xk = repeat_kv(xk, self.n_rep_)
        xv = repeat_kv(xv, self.n_rep_)

        # Phi computes q@k in fp32 for stability.
        attention_score = eager_attention_forward(
            xq.to(torch.float32), xk.to(torch.float32), xv, attention_mask
        )

        attention_score = attention_score.reshape(batch_size, max_seq_len, -1)
        attention_score = self.dense_.forward(attention_score, input_args)

        return attention_score
186
+
187
+
188
class PhiFlashAttention2(PhiAttention):
    """Flash-attention-2 variant of PhiAttention (same projections and
    rotary handling; only the attention kernel differs)."""

    def __init__(
        self,
        q_proj: nn.Module,
        k_proj: nn.Module,
        v_proj: nn.Module,
        dense: nn.Module,
        idx: int,
        args: PhiConfig,
    ):
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(q_proj, k_proj, v_proj, dense, idx, args)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        batch_size, max_seq_len, _ = hidden_states.shape

        xq = self.wq_.forward(hidden_states, input_args)
        xk = self.wk_.forward(hidden_states, input_args)
        xv = self.wv_.forward(hidden_states, input_args)

        xq = self.q_layernorm_(xq)
        xk = self.k_layernorm_(xk)

        # convert shape to multi head: (batch, heads, seq, head_dim)
        xq = xq.view(batch_size, max_seq_len, self.n_heads_, self.head_dim_).transpose(
            1, 2
        )
        xk = xk.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        xv = xv.view(
            batch_size, max_seq_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        cos, sin = rotary_emb

        # partial rotary embedding
        xq, xk = apply_partial_rotary_emb(
            xq,
            xk,
            self.rotary_emb_dim_,
            cos,
            sin,
            cache_position.unsqueeze(0),
        )

        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_emb_dim_,
                "cache_position": cache_position,
            }
            xk, xv = past_key_value.update(xk, xv, self.layer_idx_, cache_kwargs)

        # Flash attention expects (batch, seq, heads, head_dim).
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # Flash attention does not support fp32; downcast if needed.
        input_dtype = xq.dtype
        if input_dtype == torch.float32:
            if executor.is_bf16_supported():
                target_dtype = torch.bfloat16
            else:
                target_dtype = torch.float16
            xq = xq.to(target_dtype)
            xk = xk.to(target_dtype)
            xv = xv.to(target_dtype)

        attn_output = flash_attention_forward(
            xq,
            xk,
            xv,
            attention_mask,
            max_seq_len,
            is_causal=self.is_causal_,
        ).to(input_dtype)

        attn_output = attn_output.reshape(
            batch_size, max_seq_len, self.dim_
        ).contiguous()
        attn_output = self.dense_.forward(attn_output, input_args)

        return attn_output
+
281
+
282
# Dispatch table from attn_implementation_ to the attention class.
PHI_ATTENTION_CLASSES = {
    "eager": PhiAttention,
    "flash_attn": PhiFlashAttention2,
}
286
+
287
+
288
class PhiMLP(LLMFeedForward):
    """Phi two-layer MLP (fc1 -> act -> fc2) with LoRA and MixLoRA expert paths."""

    def __init__(self, fc1: nn.Module, fc2: nn.Module, args: PhiConfig) -> None:
        super().__init__()
        # feed forward
        self.fc1_: Linear = Linear(fc1, args.device_)
        self.fc2_: Linear = Linear(fc2, args.device_)
        self.act_ = ACT2FN[args.hidden_act_]

    def state_dict(self) -> Dict[str, nn.Module]:
        """Expose the wrapped projections keyed by their HF names."""
        return {
            "fc1": self.fc1_,
            "fc2": self.fc2_,
        }

    def _batch_forward(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        """Plain MLP forward with adapters applied inside each Linear."""
        hidden_states = self.fc1_.forward(hidden_states, input_args)
        hidden_states = self.act_(hidden_states)
        hidden_states = self.fc2_.forward(hidden_states, input_args)
        return hidden_states

    def _lora_forward(
        self, lora_name: str, act_fn: nn.Module, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        """MLP forward for one named adapter; each projection falls back to
        the frozen base layer when the adapter has no weights for it."""
        if lora_name in self.fc1_.loras_:
            # LoRA wrapper signature: forward(base_output, original_input).
            hidden_states = self.fc1_.loras_[lora_name].forward(
                self.fc1_.base_layer_.forward(hidden_states), hidden_states
            )
        else:
            hidden_states = self.fc1_.base_layer_.forward(hidden_states)

        hidden_states = act_fn(hidden_states)

        if lora_name in self.fc2_.loras_:
            hidden_states = self.fc2_.loras_[lora_name].forward(
                self.fc2_.base_layer_.forward(hidden_states), hidden_states
            )
        else:
            hidden_states = self.fc2_.base_layer_.forward(hidden_states)

        return hidden_states

    def _mixlora_forward(
        self, moe_name, act_fn, expert_mask, hidden_states, input_dtype
    ):
        """Per-expert MLP outputs for a MixLoRA MoE layer.

        The frozen fc1 projection is computed once over all tokens; each
        expert applies its LoRA deltas only to the rows routed to it.
        """
        # Shared base fc1 projection, reused by every expert.
        common_fc1 = self.fc1_.base_layer_.forward(hidden_states.to(input_dtype)).to(
            hidden_states.dtype
        )
        final_expert_states = []
        for expert_idx in range(expert_mask.shape[0]):
            # Row indices of the tokens routed to this expert.
            _, top_x = torch.where(expert_mask[expert_idx])

            lora_name = f"moe.{moe_name}.experts.{expert_idx}"
            if lora_name in self.fc1_.loras_:
                lora_data = slice_tensor(hidden_states, top_x, input_dtype)
                act_result = act_fn(
                    self.fc1_.loras_[lora_name].forward(
                        slice_tensor(common_fc1, top_x, input_dtype), lora_data
                    )
                )
            else:
                act_result = act_fn(slice_tensor(common_fc1, top_x, input_dtype))

            if lora_name in self.fc2_.loras_:
                final_expert_states.append(
                    self.fc2_.loras_[lora_name].forward(
                        self.fc2_.base_layer_.forward(act_result), act_result
                    )
                )
            else:
                final_expert_states.append(self.fc2_.base_layer_.forward(act_result))

        return final_expert_states
+ return final_expert_states
362
+
363
+
364
class PhiDecoderLayer(LLMDecoder):
    """One Phi transformer block with a *parallel* residual: attention and
    MLP both consume the same layer-normed input, and their outputs are
    summed with the residual."""

    def __init__(
        self, layer_id: int, self_attn: LLMAttention, mlp: FeedForward, args: PhiConfig
    ) -> None:
        super().__init__()
        self.layer_id_: int = layer_id
        self.self_attn_ = self_attn
        self.mlp_ = mlp
        # Phi uses a single pre-norm shared by both branches.
        self.input_layernorm_ = nn.LayerNorm(
            args.dim_, eps=args.layer_norm_eps_, dtype=args.dtype_, device=args.device_
        )
        self.resid_pdrop_ = args.resid_pdrop_

    def state_dict(self) -> Tuple[Dict[str, nn.Module], Dict[str, nn.Module]]:
        """Return (attention projections, MLP projections) keyed by HF names."""
        return self.self_attn_.state_dict(), self.mlp_.state_dict()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        """Run one Phi block; returns (hidden_states, *router_logits)."""
        residual = hidden_states
        hidden_states = self.input_layernorm_(hidden_states)
        # Self Attention
        attn_outputs = self.self_attn_.forward(
            hidden_states,
            input_args,
            rotary_emb,
            attention_mask,
            cache_position,
            past_key_value,
        )
        # Residual dropout is active only when training (not inference mode).
        attn_outputs = F.dropout(
            attn_outputs, self.resid_pdrop_, not input_args.inference_mode_
        )
        # Fully Connected — note: fed the normed input, not the attention output.
        feed_forward_outputs, router_logits = self.mlp_.forward(
            hidden_states, input_args
        )
        feed_forward_outputs = F.dropout(
            feed_forward_outputs, self.resid_pdrop_, not input_args.inference_mode_
        )
        # Parallel residual combine.
        hidden_states = attn_outputs + feed_forward_outputs + residual

        if input_args.output_router_logits_:
            router_logits = collect_plugin_router_logtis(
                router_logits, input_args, self
            )

        return hidden_states, *router_logits
+ return hidden_states, *router_logits
418
+
419
+
420
class PhiEmbedding(nn.Module):
    """Token embedding followed by embedding dropout, as in HF Phi."""

    def __init__(self, config: PhiConfig):
        super().__init__()
        self.embed_tokens = nn.Embedding(
            config.vocab_size_,
            config.dim_,
            config.pad_token_id_,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.embed_dropout = nn.Dropout(config.embd_pdrop_)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Lookup then dropout, in a single expression.
        return self.embed_dropout(self.embed_tokens(input_ids))
+ return self.embed_dropout(inputs_embeds)
435
+
436
+
437
class PhiLayerNorm(nn.Module):
    """Thin wrapper around nn.LayerNorm built from a PhiConfig."""

    def __init__(self, config: PhiConfig) -> None:
        super().__init__()
        norm_kwargs = dict(
            eps=config.layer_norm_eps_,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.layernorm_ = nn.LayerNorm(config.dim_, **norm_kwargs)

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.layernorm_.forward(data)
+ return self.layernorm_(data)
449
+
450
+
451
class PhiForCausalLM(LLMForCausalLM):
    """Phi causal-LM rebuilt around LoRA/MoE-capable projection wrappers,
    constructed from a frozen HF model via ``from_pretrained``."""

    def __init__(self, config: PhiConfig) -> None:
        super().__init__()
        self.config_ = config
        self.padding_idx_ = config.pad_token_id_
        self.vocab_size_ = config.vocab_size_
        self.embed_tokens_ = PhiEmbedding(config)
        self.final_layernorm_ = PhiLayerNorm(config)
        # Rotary tables cover only rotary_emb_dim_ channels (partial RoPE).
        self.rotary_emb_ = PhiRotaryEmbedding(
            dim=config.rotary_emb_dim_,
            max_position_embeddings=config.max_seq_len_,
            base=config.rope_theta_,
            device=config.device_,
        )
        # Phi's lm_head has a bias, unlike Llama's.
        self.lm_head_ = nn.Linear(
            config.dim_,
            config.vocab_size_,
            bias=True,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.layers_: List[PhiDecoderLayer] = []

    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Map token ids to (dropout-regularized) embeddings."""
        return self.embed_tokens_(input_ids)

    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # PhiRotaryEmbedding takes a sequence length, derived from the
        # largest position id in the batch.
        return self.rotary_emb_(input_tensor, seq_len=position_ids[-1, -1] + 1)

    def decoder_stack(self) -> List[LLMDecoder]:
        return self.layers_

    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the final layer norm before the LM head."""
        return self.final_layernorm_(hidden_states)

    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:
        """Build the 4-D causal attention mask for the decoder stack."""

        return prepare_4d_causal_attention_mask(
            attention_mask,
            input_tensor,
            cache_position,
            past_key_values,
        )

    def model_config(self) -> PhiConfig:
        return self.config_

    @staticmethod
    def from_pretrained(
        llm_model: modeling_phi.PhiForCausalLM,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        device: str = executor.default_device_name(),
    ):
        """Freeze a HF Phi model and rebuild it with LoRA-capable layers."""
        assert not use_sliding_window, "Phi model does not support SWA."
        llm_config: modeling_phi.PhiConfig = llm_model.config
        llm_args = PhiConfig(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.hidden_size // llm_config.num_attention_heads,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            hidden_act_=llm_config.hidden_act,
            resid_pdrop_=llm_config.resid_pdrop,
            embd_pdrop_=llm_config.embd_pdrop,
            max_seq_len_=llm_config.max_position_embeddings,
            layer_norm_eps_=llm_config.layer_norm_eps,
            rope_theta_=llm_config.rope_theta,
            partial_rotary_factor_=llm_config.partial_rotary_factor,
            qk_layernorm_=llm_config.qk_layernorm,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # Partial RoPE: only this many leading channels per head get rotated.
        llm_args.rotary_emb_dim_ = int(
            llm_args.partial_rotary_factor_ * llm_args.head_dim_
        )

        # -1 marks "no pad token" for downstream code.
        if llm_args.pad_token_id_ is None:
            llm_args.pad_token_id_ = -1

        model = PhiForCausalLM(llm_args)
        llm_model.requires_grad_(False)
        copy_parameters(llm_model.model.embed_tokens, model.embed_tokens_.embed_tokens)
        copy_parameters(
            llm_model.model.final_layernorm, model.final_layernorm_.layernorm_
        )
        copy_parameters(llm_model.lm_head, model.lm_head_)

        # Wrap every HF decoder layer with the LoRA-capable equivalents.
        for idx, layer in enumerate(llm_model.model.layers):
            decoder = PhiDecoderLayer(
                idx,
                PHI_ATTENTION_CLASSES[llm_args.attn_implementation_](
                    layer.self_attn.q_proj,
                    layer.self_attn.k_proj,
                    layer.self_attn.v_proj,
                    layer.self_attn.dense,
                    idx,
                    llm_args,
                ),
                FeedForward(
                    PhiMLP(
                        layer.mlp.fc1,
                        layer.mlp.fc2,
                        llm_args,
                    )
                ),
                llm_args,
            )
            copy_parameters(layer.input_layernorm, decoder.input_layernorm_)
            model.layers_.append(decoder)

        return model
c2cite/models/modeling_phi3.py ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from transformers.activations import ACT2FN
8
+ from transformers.models.phi3 import modeling_phi3
9
+ from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv
10
+ from transformers.utils import is_flash_attn_2_available
11
+
12
+ from moe_peft.common import (
13
+ FeedForward,
14
+ Linear,
15
+ LLMAttention,
16
+ LLMCache,
17
+ LLMDecoder,
18
+ LLMFeedForward,
19
+ LLMForCausalLM,
20
+ LLMModelConfig,
21
+ LLMModelInput,
22
+ collect_plugin_router_logtis,
23
+ eager_attention_forward,
24
+ flash_attention_forward,
25
+ prepare_4d_causal_attention_mask,
26
+ slice_tensor,
27
+ )
28
+ from moe_peft.executors import executor
29
+ from moe_peft.utils import copy_parameters
30
+
31
+ from .modeling_gemma2 import Gemma2RotaryEmbedding as Phi3RotaryEmbedding
32
+ from .modeling_llama import LlamaEmbedding as Phi3Embedding
33
+ from .modeling_llama import LlamaRMSNorm as Phi3RMSNorm
34
+
35
+
36
@dataclass
class Phi3Config(LLMModelConfig):
    """Phi-3 specific hyper-parameters on top of the shared LLMModelConfig."""

    # Epsilon used by every RMSNorm in the model.
    rms_norm_eps_: float = 1e-6
    # Training-time context length; LongRoPE switches between short/long
    # factors when the running sequence exceeds this value.
    original_max_position_embeddings_: int = 4096
    # HF-style rope_scaling dict; when set, must contain "type" == "longrope"
    # plus "short_factor"/"long_factor" lists (see Phi3LongRoPEScaledRotaryEmbedding).
    rope_scaling_: Optional[Dict[str, Any]] = None
    use_sliding_window_: bool = False
    # Sliding-window width used by the flash-attention path.
    sliding_window_: int = 4096
    # Residual dropout probability applied after attention and after the MLP.
    resid_pdrop_: float = 0.0
44
+
45
+
46
class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding):
    """LongRoPE rotary embedding for Phi-3.

    Chooses the "short" or "long" frequency rescale factors depending on
    whether the current sequence exceeds the model's original training
    context, and multiplies cos/sin by a log-based attention scaling factor.

    NOTE(review): relies on ``self.dim``, ``self.base`` and
    ``self.max_position_embeddings`` being set by the
    ``Gemma2RotaryEmbedding`` base class -- confirm those attribute names
    against modeling_gemma2.
    """

    def __init__(self, dim, config: Phi3Config, device=None):
        super().__init__(dim, config.max_seq_len_, config.rope_theta_, device)

        # Per-frequency rescale factors from the HF rope_scaling dict.
        self.short_factor = config.rope_scaling_["short_factor"]
        self.long_factor = config.rope_scaling_["long_factor"]
        self.original_max_position_embeddings = config.original_max_position_embeddings_

    @torch.no_grad()
    def forward(self, x, position_ids):
        # Use the long factors only once positions pass the original context.
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.original_max_position_embeddings:
            ext_factors = torch.tensor(
                self.long_factor, dtype=torch.float32, device=x.device
            )
        else:
            ext_factors = torch.tensor(
                self.short_factor, dtype=torch.float32, device=x.device
            )

        # Recompute inv_freq on every call with the chosen factors applied.
        inv_freq_shape = (
            torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float()
            / self.dim
        )
        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)

        inv_freq_expanded = (
            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        )
        position_ids_expanded = position_ids[:, None, :].float()

        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = (
            device_type
            if isinstance(device_type, str) and device_type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)

        # Attention temperature rescaling from the LongRoPE paper: no-op at
        # or below the original context, sqrt-log growth beyond it.
        scale = self.max_position_embeddings / self.original_max_position_embeddings
        if scale <= 1.0:
            scaling_factor = 1.0
        else:
            scaling_factor = math.sqrt(
                1
                + math.log(scale) / math.log(self.original_max_position_embeddings)
            )

        cos = emb.cos() * scaling_factor
        sin = emb.sin() * scaling_factor
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
103
+
104
+
105
class Phi3Attention(LLMAttention):
    """Eager (matmul + softmax) self-attention for Phi-3.

    Phi-3 fuses query/key/value into a single ``qkv_proj`` linear; the
    fused output is split back into per-head Q/K/V tensors in ``forward``.
    """

    def __init__(
        self, qkv_proj: nn.Module, o_proj: nn.Module, layer_idx: int, args: Phi3Config
    ) -> None:
        super().__init__()
        # attention projections, wrapped so LoRA adapters can attach
        self.qkv_proj_ = Linear(qkv_proj, args.device_)
        self.o_proj_ = Linear(o_proj, args.device_)
        # config
        self.layer_idx_ = layer_idx
        self.args_ = args
        self.dim_ = args.dim_
        self.n_heads_ = args.n_heads_
        self.n_kv_heads_ = args.n_kv_heads_
        # grouped-query attention: how many times each KV head is repeated
        self.n_rep_ = self.n_heads_ // self.n_kv_heads_
        self.rope_theta_ = args.rope_theta_
        self.head_dim_ = self.dim_ // self.n_heads_
        self.dtype_ = args.dtype_
        self.is_causal_ = True

    def state_dict(self) -> Dict[str, Linear]:
        return {
            "qkv_proj": self.qkv_proj_,
            "o_proj": self.o_proj_,
        }

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        # Split the fused projection output into Q / K / V along the last dim.
        qkv = self.qkv_proj_.forward(hidden_states, input_args)
        query_pos = self.n_heads_ * self.head_dim_
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + self.n_kv_heads_ * self.head_dim_]
        value_states = qkv[..., query_pos + self.n_kv_heads_ * self.head_dim_ :]

        # Reshape to (bsz, n_heads, q_len, head_dim).
        query_states = query_states.view(
            bsz, q_len, self.n_heads_, self.head_dim_
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        # apply rotary embedding
        # NOTE(review): cache_position.unsqueeze(0) is passed in the
        # position_ids slot of apply_rotary_pos_emb -- confirm against the
        # installed transformers signature.
        cos, sin = rotary_emb
        assert query_states.dtype == key_states.dtype
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin, cache_position.unsqueeze(0)
        )

        # Append the new K/V to the layer's cache (if one is in use).
        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx_, cache_kwargs
            )

        # Expand KV heads to match the number of query heads (GQA).
        value_states = repeat_kv(value_states, self.n_rep_)
        key_states = repeat_kv(key_states, self.n_rep_)

        attn_output = eager_attention_forward(
            query_states, key_states, value_states, attention_mask
        )
        attn_output = attn_output.reshape(bsz, q_len, -1)

        return self.o_proj_(attn_output, input_args)
184
+
185
+
186
class Phi3FlashAttention2(Phi3Attention):
    """Flash-Attention-2 variant of :class:`Phi3Attention`.

    Adds sliding-window handling: once the cached KV length exceeds the
    window, the oldest cached entries are sliced off before the cache
    update, and the attention mask is adjusted to match.
    """

    def __init__(
        self, qkv_proj: nn.Module, o_proj: nn.Module, layer_idx: int, args: Phi3Config
    ) -> None:
        assert is_flash_attn_2_available(), "Flash Attention is not available"
        super().__init__(qkv_proj, o_proj, layer_idx, args)
        self.sliding_window_ = args.sliding_window_

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):

        bsz, q_len, _ = hidden_states.size()

        # cutting: split the fused qkv projection into Q / K / V
        qkv = self.qkv_proj_.forward(hidden_states, input_args)
        query_pos = self.n_heads_ * self.head_dim_
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + self.n_kv_heads_ * self.head_dim_]
        value_states = qkv[..., query_pos + self.n_kv_heads_ * self.head_dim_ :]

        # viewing: reshape to (bsz, heads, seq, head_dim)
        query_states = query_states.view(
            bsz, q_len, self.n_heads_, self.head_dim_
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.n_kv_heads_, self.head_dim_
        ).transpose(1, 2)

        # NOTE(review): treats cache_position[0] as the number of already
        # cached tokens -- assumes cache_position starts at the first new
        # token index; confirm against the caller.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += cache_position[0]

        # apply rotary embedding
        # NOTE(review): unlike the eager path, no position ids are passed
        # here -- cos/sin are assumed to already be per-position.
        cos, sin = rotary_emb
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        # Activate slicing cache
        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx_) > 0
            if (
                self.sliding_window_ is not None
                and kv_seq_len > self.sliding_window_
                and cache_has_contents
            ):
                # Keep only the last (sliding_window - 1) cached positions.
                slicing_tokens = 1 - self.sliding_window_

                # NOTE(review): indexes the cache object directly
                # (past_key_value[idx][0/1]) -- requires the LLMCache
                # implementation to support subscripting.
                past_key = past_key_value[self.layer_idx_][0]
                past_value = past_key_value[self.layer_idx_][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.sliding_window_ - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.sliding_window - 1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat(
                        [attention_mask, torch.ones_like(attention_mask[:, -1:])],
                        dim=-1,
                    )

            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx_, cache_kwargs
            )

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.n_rep_)
        value_states = repeat_kv(value_states, self.n_rep_)

        # Flash attention kernels require fp16/bf16 inputs; downcast fp32
        # and restore the original dtype on the way out.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if executor.is_bf16_supported():
                target_dtype = torch.bfloat16
            else:
                target_dtype = torch.float16
            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # flash_attention_forward expects (bsz, seq, heads, head_dim).
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            is_causal=self.is_causal_,
            sliding_window=self.sliding_window_,
        ).to(input_dtype)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj_(attn_output, input_args)

        return attn_output
305
+
306
+
307
# Maps the `attn_implementation_` config string to an attention class.
PHI3_ATTENTION_CLASSES = {
    "eager": Phi3Attention,
    "flash_attn": Phi3FlashAttention2,
}
311
+
312
+
313
class Phi3MLP(LLMFeedForward):
    """Phi-3 fused gate/up MLP with optional per-adapter LoRA paths.

    ``gate_up_proj`` emits ``[gate | up]`` concatenated along the last
    dimension; the gated product ``up * act(gate)`` is then fed through
    ``down_proj``.
    """

    def __init__(self, gate: nn.Module, down: nn.Module, args: Phi3Config) -> None:
        super().__init__()
        # Wrap both projections so LoRA adapters can attach to them.
        self.gate_up_proj_ = Linear(gate, args.device_)
        self.down_proj_ = Linear(down, args.device_)
        self.act_ = ACT2FN[args.hidden_act_]

    def state_dict(self) -> Dict[str, nn.Module]:
        return {"gate_up_proj": self.gate_up_proj_, "down_proj": self.down_proj_}

    def _batch_forward(
        self, hidden_states: torch.Tensor, input_args: LLMModelInput
    ) -> torch.Tensor:
        # Full forward including any active adapters on both projections.
        fused = self.gate_up_proj_(hidden_states, input_args)
        gate_part, up_part = fused.chunk(2, dim=-1)
        return self.down_proj_(up_part * self.act_(gate_part), input_args)

    def _lora_forward(
        self, lora_name: str, act_fn: nn.Module, data: torch.Tensor
    ) -> torch.Tensor:
        # Base projection first; layer the named LoRA delta on top when
        # that adapter exists for the projection.
        base_fused = self.gate_up_proj_.base_layer_.forward(data)
        if lora_name in self.gate_up_proj_.loras_:
            fused = self.gate_up_proj_.loras_[lora_name].forward(base_fused, data)
        else:
            fused = base_fused

        gate_part, up_part = fused.chunk(2, dim=-1)
        activated = act_fn(gate_part) * up_part

        base_down = self.down_proj_.base_layer_.forward(activated)
        if lora_name in self.down_proj_.loras_:
            return self.down_proj_.loras_[lora_name].forward(base_down, activated)
        return base_down

    def _mixlora_forward(
        self, moe_name, act_fn, expert_mask, hidden_states, input_dtype
    ):
        # The base gate_up output is shared by all experts, so compute it
        # once for the whole batch.
        shared_fused = self.gate_up_proj_.base_layer_.forward(
            hidden_states.to(input_dtype)
        ).to(hidden_states.dtype)

        expert_outputs = []
        for expert_idx in range(expert_mask.shape[0]):
            # Tokens routed to this expert.
            _, token_idx = torch.where(expert_mask[expert_idx])
            adapter = f"moe.{moe_name}.experts.{expert_idx}"

            fused_slice = slice_tensor(shared_fused, token_idx, input_dtype)
            if adapter in self.gate_up_proj_.loras_:
                fused_slice = self.gate_up_proj_.loras_[adapter].forward(
                    fused_slice,
                    slice_tensor(hidden_states, token_idx, input_dtype),
                )

            gate_part, up_part = fused_slice.chunk(2, dim=-1)
            activated = up_part * act_fn(gate_part)

            down_out = self.down_proj_.base_layer_.forward(activated)
            if adapter in self.down_proj_.loras_:
                down_out = self.down_proj_.loras_[adapter].forward(down_out, activated)
            expert_outputs.append(down_out)

        return expert_outputs
394
+
395
+
396
class Phi3DecoderLayer(LLMDecoder):
    """A single Phi-3 transformer layer: pre-norm attention and pre-norm
    MLP, each followed by residual dropout."""

    def __init__(self, layer_id: int, config: Phi3Config) -> None:
        super().__init__()
        self.layer_id_: int = layer_id
        # Sub-modules are attached after construction by from_pretrained.
        self.self_attn_: Optional[Phi3Attention] = None
        self.mlp_: Optional[FeedForward] = None
        self.input_layernorm_: Optional[Phi3RMSNorm] = None
        self.post_attention_layernorm_: Optional[Phi3RMSNorm] = None
        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop_)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop_)

    def state_dict(self) -> Tuple[Dict[str, nn.Module], Dict[str, nn.Module]]:
        return self.self_attn_.state_dict(), self.mlp_.state_dict()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_args: LLMModelInput,
        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        past_key_value: Optional[LLMCache] = None,
    ):
        # Attention sub-block: pre-norm, attend, residual add with dropout.
        normed = self.input_layernorm_(hidden_states)
        attn_out = self.self_attn_.forward(
            normed,
            input_args,
            rotary_emb,
            attention_mask,
            cache_position,
            past_key_value,
        )
        hidden_states = hidden_states + self.resid_attn_dropout(attn_out)

        # MLP sub-block: pre-norm, feed-forward, residual add with dropout.
        normed = self.post_attention_layernorm_(hidden_states)
        mlp_out, router_logits = self.mlp_.forward(normed, input_args)
        hidden_states = hidden_states + self.resid_mlp_dropout(mlp_out)

        if input_args.output_router_logits_:
            router_logits = collect_plugin_router_logtis(
                router_logits, input_args, self
            )

        return hidden_states, *router_logits
444
+
445
+
446
class Phi3ForCausalLM(LLMForCausalLM):
    """Phi-3 causal LM wrapper used by this library.

    Sub-modules (embedding, final norm, decoder layers) are populated by
    :meth:`from_pretrained`, which copies weights out of a HuggingFace
    ``Phi3ForCausalLM``.
    """

    def _init_rope(self):
        # Plain RoPE unless the HF config carried a "longrope" scaling dict.
        if self.config_.rope_scaling_ is None:
            return Phi3RotaryEmbedding(
                self.config_.head_dim_,
                max_position_embeddings=self.config_.max_seq_len_,
                base=self.config_.rope_theta_,
                device=self.config_.device_,
            )
        else:
            scaling_type = self.config_.rope_scaling_["type"]
            # Only LongRoPE scaling is supported for Phi-3.
            assert scaling_type == "longrope", ValueError(
                f"Unknown RoPE scaling type {scaling_type}"
            )
            return Phi3LongRoPEScaledRotaryEmbedding(
                self.config_.head_dim_,
                config=self.config_,
                device=self.config_.device_,
            )

    def __init__(self, config: Phi3Config) -> None:
        super().__init__()
        self.config_ = config
        self.padding_idx_ = config.pad_token_id_
        self.vocab_size_ = config.vocab_size_
        # Filled in by from_pretrained.
        self.embed_tokens_: Optional[Phi3Embedding] = None
        # Final RMSNorm (annotation fixed: this holds a Phi3RMSNorm,
        # not a Phi3Embedding -- see from_pretrained below).
        self.norm_: Optional[Phi3RMSNorm] = None
        self.rotary_emb_ = self._init_rope()
        self.lm_head_ = nn.Linear(
            config.dim_,
            config.vocab_size_,
            bias=False,
            dtype=config.dtype_,
            device=config.device_,
        )
        self.layers_: List[Phi3DecoderLayer] = []

    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens_(input_ids)

    def rotary_embed(
        self, input_tensor: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Returns the (cos, sin) pair shared by every decoder layer.
        return self.rotary_emb_(input_tensor, position_ids)

    def decoder_stack(self) -> List[LLMDecoder]:
        return self.layers_

    def norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm_(hidden_states)

    def causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Optional[LLMCache],
    ) -> torch.Tensor:

        return prepare_4d_causal_attention_mask(
            attention_mask,
            input_tensor,
            cache_position,
            past_key_values,
        )

    def model_config(self) -> Phi3Config:
        return self.config_

    @staticmethod
    def from_pretrained(
        llm_model: modeling_phi3.Phi3ForCausalLM,
        attn_impl: str = "eager",
        use_sliding_window: bool = False,
        # NOTE(review): default evaluated once at class-definition time.
        device: str = executor.default_device_name(),
    ):
        """Build a Phi3ForCausalLM wrapper around a loaded HF model,
        freezing the HF weights and reusing/copying them layer by layer."""
        llm_config: modeling_phi3.Phi3Config = llm_model.config
        llm_args = Phi3Config(
            name_or_path_=llm_config.name_or_path,
            vocab_size_=llm_config.vocab_size,
            dim_=llm_config.hidden_size,
            head_dim_=llm_config.hidden_size // llm_config.num_attention_heads,
            intermediate_=llm_config.intermediate_size,
            n_layers_=llm_config.num_hidden_layers,
            n_heads_=llm_config.num_attention_heads,
            n_kv_heads_=llm_config.num_key_value_heads,
            hidden_act_=llm_config.hidden_act,
            rms_norm_eps_=llm_config.rms_norm_eps,
            resid_pdrop_=llm_config.resid_pdrop,
            max_seq_len_=llm_config.max_position_embeddings,
            rope_theta_=llm_config.rope_theta,
            rope_scaling_=llm_config.rope_scaling,
            original_max_position_embeddings_=llm_config.original_max_position_embeddings,
            pad_token_id_=llm_config.pad_token_id,
            attn_implementation_=attn_impl,
            use_sliding_window_=use_sliding_window,
            sliding_window_=llm_config.sliding_window,
            device_=torch.device(device),
            dtype_=llm_model.dtype,
        )

        # Sentinel padding id when the tokenizer defines none.
        if llm_args.pad_token_id_ is None:
            llm_args.pad_token_id_ = -1

        model = Phi3ForCausalLM(llm_args)
        # Base weights stay frozen; only adapters are trained.
        llm_model.requires_grad_(False)
        model.embed_tokens_ = Phi3Embedding(
            llm_model.model.embed_tokens.weight, llm_args.pad_token_id_
        )
        model.norm_ = Phi3RMSNorm(llm_model.model.norm.weight, llm_args.rms_norm_eps_)
        copy_parameters(llm_model.lm_head, model.lm_head_)

        # Re-wrap each HF decoder layer with the adapter-aware modules.
        for idx, layer in enumerate(llm_model.model.layers):
            decoder = Phi3DecoderLayer(idx, llm_args)
            decoder.self_attn_ = PHI3_ATTENTION_CLASSES[llm_args.attn_implementation_](
                layer.self_attn.qkv_proj,
                layer.self_attn.o_proj,
                idx,
                llm_args,
            )
            decoder.mlp_ = FeedForward(
                Phi3MLP(
                    layer.mlp.gate_up_proj,
                    layer.mlp.down_proj,
                    llm_args,
                )
            )
            decoder.input_layernorm_ = Phi3RMSNorm(
                layer.input_layernorm.weight, llm_args.rms_norm_eps_
            )
            decoder.post_attention_layernorm_ = Phi3RMSNorm(
                layer.post_attention_layernorm.weight, llm_args.rms_norm_eps_
            )
            model.layers_.append(decoder)

        return model
c2cite/prompter.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os.path as osp
4
+ from typing import Dict, Optional, Union
5
+
6
# Built-in prompt templates, keyed by name. Each template provides a
# with-input variant, a no-input variant, and the delimiter used to
# extract the model's response.
prompt_templates = {
    "moe_peft": {
        "description": "Default Prompt Template Provided by MoE-PEFT",
        "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n",
        "prompt_no_input": "### Instruction:\n{instruction}\n\n### Output:\n",
        "response_split": "### Output:",
    },
    "alpaca": {
        "description": "Template used by Alpaca-LoRA.",
        "prompt_input": "Below is an instruction that describes a task, "
        + "paired with an input that provides further context. "
        + "Write a response that appropriately completes the request.\n\n"
        + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
        "prompt_no_input": "Below is an instruction that describes a task. "
        + "Write a response that appropriately completes the request.\n\n"
        + "### Instruction:\n{instruction}\n\n### Response:\n",
        "response_split": "### Response:",
    },
}
25
+
26
+
27
# manage templates and prompt building.
class Prompter:
    """Resolves a prompt template and renders instruction/input pairs with it."""

    def __init__(self, template: Optional[Union[Dict, str]] = None):
        # A string argument is either a path to a JSON template file or
        # the name of a built-in template; a dict is used as-is; None
        # selects the library default.
        if isinstance(template, str):
            if osp.exists(template):
                with open(template) as fp:
                    template = json.load(fp)
            else:
                template = prompt_templates[template]
        self.template = (
            template if template is not None else prompt_templates["moe_peft"]
        )

        logging.info(f"Using prompt template: {self.template['description']}")

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # Pick the with-input or no-input variant and fill it in; when a
        # label (= response / output) is given, append it plus a newline.
        key = "prompt_input" if input else "prompt_no_input"
        fields = {"instruction": instruction}
        if input:
            fields["input"] = input
        res = self.template[key].format(**fields)
        if label:
            res += f"{label}\n"
        logging.debug(res)
        return res

    def get_response(self, output: str) -> str:
        # Everything after the final response delimiter is the answer.
        _, _, tail = output.rpartition(self.template["response_split"])
        return tail.strip()
c2cite/solutions.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
# From the "peering" paper.
def get_output(layers, hidden, output, ans_len):
    """Placeholder for peering-style output extraction.

    NOTE(review): not implemented -- both the 32-layer branch and the
    fallback branch are `pass`, so this always returns None regardless
    of its arguments.
    """
    if layers == 32:
        pass
    else:
        pass
c2cite/tasks/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Package entry point for tasks: imports the task modules, lets each one
# register its entries into the shared `task_dict`, and re-exports the
# public task/metric classes.
from . import glue_tasks, qa_tasks, attribute_tasks
from .common import (
    AutoMetric,
    BasicMetric,
    BasicTask,
    CasualTask,
    CommonSenseTask,
    MultiTask,
    SequenceClassificationTask,
    task_dict,
)
from .qa_tasks import QuestionAnswerTask

# Each task module appends its own tasks to the shared registry.
glue_tasks.update_task_dict(task_dict)
qa_tasks.update_task_dict(task_dict)
attribute_tasks.update_task_dict(task_dict)


__all__ = [
    "BasicMetric",
    "AutoMetric",
    "BasicTask",
    "CasualTask",
    "SequenceClassificationTask",
    "CommonSenseTask",
    "QuestionAnswerTask",
    "MultiTask",
    "task_dict",
]
c2cite/tasks/attribute_tasks.py ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import random
3
+ from typing import List, Optional
4
+
5
+ import datasets as hf_datasets
6
+ import torch
7
+ import json
8
+ import re
9
+ import os
10
+ from tqdm import tqdm
11
+
12
+ from transformers import BertTokenizer, BertModel
13
+
14
+
15
+ from moe_peft.common import InputData
16
+
17
+ from moe_peft.tasks.common import AttributeTask, BasicMetric, AutoMetric
18
+
19
+
20
class AttributedAnswerTask(AttributeTask):
    """Base class for attributed (citation-grounded) answer tasks."""

    def __init__(self) -> None:
        super().__init__()

    def loading_metric(self, metrics: List[str]):
        # Builds the shared "attribute" metric bundle for the given names.
        # NOTE(review): subclasses such as ASQA override this with a
        # zero-argument signature -- confirm which call shape the
        # evaluator actually uses.
        return AutoMetric("attribute", metrics)
28
+
29
class ASQA(AttributedAnswerTask):
    """ASQA attributed-QA task (ALCE benchmark).

    Builds citation-grounded prompts from the ALCE ASQA retrieval file.
    ``sub`` selects whether documents are rendered from full text
    ("vani") or from their summaries.
    """

    def __init__(self, sub: str = 'vani'):
        super().__init__()
        # Three instruction variants; only `inst_special_token` is used below.
        self.inst = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_new = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite all of them at the end of the sentences. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document in each sentence.'
        self.sub = sub

    def loading_data(self, is_train: bool = False, path: str = None, few_shot: bool = True
    ) -> List[InputData]:
        """Load ASQA examples and render Llama-3-style chat prompts with
        <|reserved_special_token_i|> citation markers for each document."""
        # Few-shot prompting is force-disabled here, overriding the argument.
        few_shot = False #################################

        num_docs = 5
        current_dir = os.path.dirname(os.path.abspath(__file__))
        relative_path = "../../dataset/ALCE-data/asqa_eval_gtr_top100.json" # go up two levels, then into the dataset directory
        file_path = os.path.join(current_dir, relative_path)

        with open(path if path is not None else file_path,'r',encoding='utf-8') as file:
            data = json.load(file)
        logging.info("Preparing data for ASQA")
        ret: List[InputData] = []
        #cnt = 5
        """tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased')
        device = 'cuda:6'
        model = model.to(device)
        model.eval()"""
        for data_point in tqdm(data):
            #if cnt == 0:
            #    break
            #cnt = cnt - 1
            #prompt = ""
            # Llama-3 chat header followed by the citation instruction.
            prompt = "<|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            #prompt += self.inst_new
            prompt += self.inst_special_token
            if few_shot:
                prompt += f"Here is an example:\n\nQuestion: Who played galen in planet of the apes?\n\nDocument [1](Title: Planet of the Apes): installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts.\nDocument [2](Title: Planet of the Apes (1968 film)): chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as\nDocument [3](Title: Planet of the Apes (1968 film)): Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the\nDocument [4](Title: Planet of the Apes): Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination\nDocument [5](Title: Planet of the Apes): film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the\n\nAnswer:In the 1968 film Planet of the Apes, Galen was played by Wright King [2]. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall [1].\n\n\n"
            #prompt += f"\n\n\nQusetion: {data_point['qa_pairs'][0]['question']}\n\n"
            # NOTE(review): "Qusetion" typo is preserved -- it is part of the
            # runtime prompt text; changing it would change model inputs.
            prompt += f"\n\n\nQusetion: {data_point['question']}\n\n"
            docs = ""
            cites = []
            for i in range(num_docs):
                cites.append({
                    'text': data_point['docs'][i]['text'],
                    'title': data_point['docs'][i]['title'],
                    'summary': data_point['docs'][i]['summary'],
                })
            #random.shuffle(cites)
            for i in range(num_docs):
                docs += f"Document <|reserved_special_token_{i+1}|>: {cites[i]['text'] if self.sub=='vani' else cites[i]['summary']}\n"
                #docs += f"Document <|reserved_special_token_{i+1}|>(Title: {cites[i]['title']}): {cites[i]['text'] if self.sub=='vani' else cites[i]['summary']}\n"
                #docs += f"Document [{i+1}](Title: {cites[i]['title']}): {cites[i]['text'] if self.sub=='vani' else cites[i]['summary']}\n"
            cites = [cites[i]['text'] if self.sub=='vani' else cites[i]['summary'] for i in range(num_docs)]
            prompt += docs
            prompt += f"\nAnswer:"
            # prompt += "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
            #citation_embeds = sents_embed(cites, model, tokenizer, device)
            ret.append(InputData(inputs=prompt, labels=data_point['answer'], \
                grounds=data_point['qa_pairs'], citations = cites,# citation_embeds = citation_embeds,\
                query = data_point['question']))

        return ret

    def loading_metric(self):
        # NOTE(review): `metric_list` is not defined or imported in the code
        # visible here -- confirm it is defined elsewhere in this module,
        # otherwise this raises NameError at call time.
        config = {}
        config['task'] = 'asqa'
        config['metric'] = metric_list['asqa']
        return AutoMetric("attribute", config)
97
+
98
+
99
class ELI5(AttributedAnswerTask):
    """ELI5 attributed long-form QA task from the ALCE benchmark.

    Builds Llama-3 chat prompts in which each retrieved passage is tagged with
    a reserved special token (``<|reserved_special_token_i|>``) that the model
    is expected to emit as its citation marker.
    """

    def __init__(self, sub: str = 'vani'):
        # sub == 'vani' uses the raw passage text as document context;
        # any other value uses the pre-computed passage summary instead.
        super().__init__()
        self.inst = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        # Same instruction minus the "[1][2][3]" citation-format sentence; used
        # with the reserved-special-token citation scheme.
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.sub = sub

    def loading_data(self, is_train: bool = False, path: str = None, few_shot: bool = True
                     ) -> List[InputData]:
        """Load eli5_eval_bm25_top100.json and build one InputData per question.

        ``path`` overrides the default dataset location. Note that few-shot
        prompting is force-disabled below regardless of the ``few_shot``
        argument, and ``is_train`` is accepted only for interface parity.
        """
        few_shot = False  ############## NOTE(review): overrides the caller's choice
        num_docs = 5  # only the top-5 retrieved passages are shown to the model
        current_dir = os.path.dirname(os.path.abspath(__file__))
        relative_path = "../../dataset/ALCE-data/eli5_eval_bm25_top100.json"  # two levels up, into the dataset directory
        file_path = os.path.join(current_dir, relative_path)
        with open(path if path is not None else file_path,'r',encoding='utf-8') as file:
            data = json.load(file)
        logging.info("Preparing data for ELI5")
        ret: List[InputData] = []
        for data_point in tqdm(data):
            # Llama-3 chat preamble: system turn, then start of the user turn.
            prompt = "<|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            prompt += self.inst_special_token
            if few_shot:  # dead branch while few_shot is forced False above
                prompt += f"Here is an example:\n\nQuestion: Who played galen in planet of the apes?\n\nDocument [1](Title: Planet of the Apes): installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts.\nDocument [2](Title: Planet of the Apes (1968 film)): chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as\nDocument [3](Title: Planet of the Apes (1968 film)): Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the\nDocument [4](Title: Planet of the Apes): Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination\nDocument [5](Title: Planet of the Apes): film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the\n\nAnswer:In the 1968 film Planet of the Apes, Galen was played by Wright King [2]. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall [1].\n\n\n"
            # NOTE(review): "Qusetion" is a typo, but it is part of the runtime
            # prompt and is used consistently across tasks in this file, so it
            # is deliberately left unchanged.
            prompt += f"\n\n\nQusetion: {data_point['question']}\n\n"
            docs = ""
            cites = []
            for i in range(num_docs):
                cites.append({
                    'text': data_point['docs'][i]['text'],
                    'title': data_point['docs'][i]['title'],
                    'summary': data_point['docs'][i]['summary'],
                })
            for i in range(num_docs):
                # Each passage is labelled with a reserved special token rather
                # than a numeric [i] marker.
                docs += f"Document <|reserved_special_token_{i+1}|>: {cites[i]['text'] if self.sub=='vani' else cites[i]['summary']}\n"
            # Flatten to the plain strings used downstream as citation targets.
            cites = [cites[i]['text'] if self.sub=='vani' else cites[i]['summary'] for i in range(num_docs)]
            prompt += docs
            prompt += f"\nAnswer:"
            ret.append(InputData(inputs=prompt, labels=data_point['answer'], \
                grounds=data_point['claims'], citations = cites, \
                query = data_point['question']))

        return ret

    def loading_metric(self):
        """Return the attribution metric evaluator configured for ELI5."""
        config = {}
        config['task'] = 'eli5'
        config['metric'] = metric_list['eli5']
        return AutoMetric("attribute", config)
155
+
156
class Qampari(AttributedAnswerTask):
    """QAMPARI attributed list-QA task from the ALCE benchmark.

    Prompts are built like the other ALCE tasks in this module; documents are
    tagged with reserved special tokens used as citation markers.
    """

    def __init__(self, sub: str = 'vani'):
        # `sub` is stored for interface parity with the sibling tasks; the
        # QAMPARI docs carry no 'summary' field, so only raw text is used.
        super().__init__()
        self.inst = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        # Same instruction minus the "[1][2][3]" citation-format sentence.
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.sub = sub

    def loading_data(self, is_train: bool = False, path: str = None, few_shot: bool = True
                     ) -> List[InputData]:
        """Load qampari_eval_gtr_top100.json and build one InputData per question.

        ``path`` overrides the default dataset location. Few-shot prompting is
        force-disabled below regardless of the ``few_shot`` argument.
        """
        few_shot = False  ############## NOTE(review): overrides the caller's choice
        num_docs = 5  # only the top-5 retrieved passages are shown to the model
        current_dir = os.path.dirname(os.path.abspath(__file__))
        relative_path = "../../dataset/ALCE-data/qampari_eval_gtr_top100.json"  # two levels up, into the dataset directory
        file_path = os.path.join(current_dir, relative_path)
        with open(path if path is not None else file_path,'r',encoding='utf-8') as file:
            data = json.load(file)
        logging.info("Preparing data for Qampari")
        ret: List[InputData] = []
        for data_point in tqdm(data):
            # Llama-3 chat preamble: system turn, then start of the user turn.
            prompt = "<|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            prompt += self.inst_special_token
            if few_shot:  # dead branch while few_shot is forced False above
                prompt += f"Here is an example:\n\nQuestion: Who played galen in planet of the apes?\n\nDocument [1](Title: Planet of the Apes): installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts.\nDocument [2](Title: Planet of the Apes (1968 film)): chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as\nDocument [3](Title: Planet of the Apes (1968 film)): Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the\nDocument [4](Title: Planet of the Apes): Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination\nDocument [5](Title: Planet of the Apes): film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the\n\nAnswer:In the 1968 film Planet of the Apes, Galen was played by Wright King [2]. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall [1].\n\n\n"
            # NOTE(review): "Qusetion" typo is part of the runtime prompt, kept
            # as-is for consistency with the sibling tasks.
            prompt += f"\n\n\nQusetion: {data_point['question']}\n\n"
            docs = ""
            cites = []
            for i in range(num_docs):
                cites.append({
                    'text': data_point['docs'][i]['text'],
                    'title': data_point['docs'][i]['title'],
                })
            for i in range(num_docs):
                # Each passage is labelled with a reserved special token rather
                # than a numeric [i] marker.
                docs += f"Document <|reserved_special_token_{i+1}|>: {cites[i]['text']}\n"
            # Flatten to the plain strings used downstream as citation targets.
            cites = [cites[i]['text'] for i in range(num_docs)]
            prompt += docs
            prompt += f"\nAnswer:"
            ret.append(InputData(inputs=prompt, labels=data_point['answers'], \
                citations = cites, \
                query = data_point['question']))
        return ret

    def loading_metric(self):
        """Return the attribution metric evaluator configured for QAMPARI."""
        config = {}
        config['task'] = 'qam'
        config['metric'] = metric_list['qam']
        return AutoMetric("attribute", config)
210
+
211
+
212
class QouteSum(AttributedAnswerTask):
    """QuoteSum quoted-summary task (class name typo preserved for callers).

    Examples sharing a qid are grouped; each group yields one prompt whose
    documents are tagged with reserved special tokens used as citation markers.
    """

    def __init__(self, sub: str = 'vani'):
        # `sub` selects the data variant: 'alce' / 'ans' / default 'vani'.
        super().__init__()
        self.sub = sub
        self.inst = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        # NOTE(review): "bilities" below is a typo in the runtime prompt; left
        # unchanged because this string is (potentially) fed to the model.
        self.inst2 = 'Based on the information contained in the document, answer the question with details to the best of your bilities. Think step by step and explain your answer if that will help better understand the answer.'
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_new = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite all of them at the end of the sentences. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document in each sentence.'

    def loading_data(self, is_train: bool = False, path: str = None,
                     few_shot: bool = True ) -> List[InputData]:
        """Load the QuoteSum jsonl split and build one InputData per qid group.

        ``path`` overrides the default dataset location; few-shot prompting is
        force-disabled below regardless of the ``few_shot`` argument.
        """
        few_shot = False  ########### NOTE(review): overrides the caller's choice
        if is_train:
            few_shot = False
        ret: List[InputData] = []
        examples_by_qid = {}
        with open(f"/yy21/MoE-PEFT/dataset/{'qoutesum_alce' if self.sub == 'alce' else ( 'qoutesum_ans' if self.sub == 'ans' else 'qoutesum')}/{'train' if is_train else 'test'}.jsonl" if path is None else path, 'r') as f:
            for line in f:
                # Group all annotations that belong to the same question.
                example = json.loads(line.strip())
                if example['qid'] not in examples_by_qid:
                    examples_by_qid[example['qid']] = [example]
                else:
                    examples_by_qid[example['qid']].append(example)

        examples = list(examples_by_qid.values())
        for example in examples:
            # Llama-3 chat preamble: system turn, then start of the user turn.
            prompt = "<|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            prompt += self.inst_special_token
            if few_shot:  # dead branch while few_shot is forced False above
                if self.sub == 'alce':
                    prompt += f" Here are some examples:\nQuestion: how much power does a wind turbine produce?\nDocument [1](Title:): Compact wind acceleration turbine: It is generally thought that since the amount of power produced by a wind turbine is proportional to the cube of the wind speed, any acceleration benefit is potentially statistically significant in the economics of wind. As noted though this is an inaccurate as it ignores the impact of the exit to area ratio and is therefore an apples to oranges comparison. In the case of a typical CWAT/DAWT the power result in perfect theoretical operation once adjusted for the area of the shroud is actually the square of the velocity at the rotor. As the CWAT/DAWT diverges from theoretical function the power increase drops significantly according\nDocument [2](Title:): Sustainable architecture: roof ledge. Small-scale rooftop wind turbines have been known to be able to generate power from 10% to up to 25% of the electricity required of a regular domestic household dwelling. Turbines for residential scale use are usually between 7 feet (2 m) to 25 feet (8 m) in diameter and produce electricity at a rate of 900 watts to 10,000 watts at their tested wind speed. Building integrated wind turbine performance can be enhanced with the addition of an aerofoil wing on top of a roof mounted turbine. Solar water heaters, also called solar domestic hot water systems, can\nDocument [3](Title:): Turby wind turbine: can because horizontal axis (HAWT) types cannot change their pitch to face the wind directly. The turbine measures 2.0m (6'7\") in diameter by 2.9m (9'6\") high (including generator), and weighs 136 kg (300 lb). It is specified to generate power in winds of between 4 m/s (9 mph, 7.8kts) and 14 m/s (31 mph, 27.2kts), and can survive winds of 55 m/s (123 mph, 107kts). The rated power at 14 m/s is 2.5 kW (3.35 hp). The AC output from the synchronous generator is rectified to DC, then inverted to AC at 230V 50 Hz. Core International developed the turbine\nAnswer: One source states the amount of power produced by a wind turbine is proportional to the cube of the wind speed [1]. Other sources state Turbines for residential scale use produce electricity at a rate of 900 watts to 10,000 watts, and is specified to generate power in winds of between 4 m/s (9 mph, 7.8kts) and 14 m/s (31 mph, 27.2kts) [2][3]."
                elif self.sub == 'vani':
                    prompt += f" Here are some examples:\nQuestion: how much power does a wind turbine produce?\n[1] Compact wind acceleration turbine: It is generally thought that since the amount of power produced by a wind turbine is proportional to the cube of the wind speed, any acceleration benefit is potentially statistically significant in the economics of wind. As noted though this is an inaccurate as it ignores the impact of the exit to area ratio and is therefore an apples to oranges comparison. In the case of a typical CWAT/DAWT the power result in perfect theoretical operation once adjusted for the area of the shroud is actually the square of the velocity at the rotor. As the CWAT/DAWT diverges from theoretical function the power increase drops significantly according\n[2] Sustainable architecture: roof ledge. Small-scale rooftop wind turbines have been known to be able to generate power from 10% to up to 25% of the electricity required of a regular domestic household dwelling. Turbines for residential scale use are usually between 7 feet (2 m) to 25 feet (8 m) in diameter and produce electricity at a rate of 900 watts to 10,000 watts at their tested wind speed. Building integrated wind turbine performance can be enhanced with the addition of an aerofoil wing on top of a roof mounted turbine. Solar water heaters, also called solar domestic hot water systems, can\n[3] Turby wind turbine: can because horizontal axis (HAWT) types cannot change their pitch to face the wind directly. The turbine measures 2.0m (6'7\") in diameter by 2.9m (9'6\") high (including generator), and weighs 136 kg (300 lb). It is specified to generate power in winds of between 4 m/s (9 mph, 7.8kts) and 14 m/s (31 mph, 27.2kts), and can survive winds of 55 m/s (123 mph, 107kts). The rated power at 14 m/s is 2.5 kW (3.35 hp). The AC output from the synchronous generator is rectified to DC, then inverted to AC at 230V 50 Hz. Core International developed the turbine\nAnswer: One source states the [ 1 amount of power produced by a wind turbine is proportional to the cube of the wind speed ] . Other sources state [ 2 Turbines for residential scale use ] [ 2 produce electricity at a rate of 900 watts to 10,000 watts ] , and [ 3 is specified to generate power in winds of between 4 m/s (9 mph, 7.8kts) and 14 m/s (31 mph, 27.2kts) ] .\n\nQuestion: a component is what?\n[1] Modular programming: in Dart, Go or Java) is sometimes used instead of module. In other implementations, this is a distinct concept; in Python a package is a collection of modules, while in Java 9 the introduction of the new module concept (a collection of packages with enhanced access control) is planned. Furthermore, the term \"package\" has other uses in software (for example .NET NuGet packages). A component is a similar concept, but typically refers to a higher level; a component is a piece of a whole system, while a module is a piece of an individual program. The scale of the term\n[2] Physical body: the system at a point in time changes from identifying the object to not identifying it. Also an object's identity is created at the first point in time that the simplest model of the system consistent with perception identifies it. An object may be composed of components. A component is an object completely within the boundary of a containing object. In classical mechanics a physical body is collection of matter having properties including mass, velocity, momentum and energy. The matter exists in a volume of three-dimensional space. This space is its extension. Under Newtonian gravity the gravitational field further away\nQuoted summary: [ 1 A component is a similar concept, but typically refers to a higher level; a component is a piece of a whole system, while a module is a piece of an individual program ] in terms of [ 1 Modular programming ] . Whereas in the [ 2 Physical body ] , a [ 2 component is an object completely within the boundary of a containing object ] ."
                elif self.sub == 'ans':
                    pass
            # NOTE(review): "Qusetion" typo kept — it is part of the runtime prompt.
            prompt += f"\n\nQusetion: {example[0]['question']}\n"
            docs = ""
            sources = []
            citations = []
            # Collect up to 8 (title{i}, source{i}) pairs; stop at the first
            # missing title key, so `sources` may hold fewer than 8 entries.
            for i in range(8):
                if f"title{i+1}" not in example[0]:
                    break
                sources.append({'title': example[0][f'title{i+1}'],
                                'doc': example[0][f"source{i+1}"]}
                               )
            # FIX: was `for i in range(8)`, which raised IndexError whenever the
            # loop above collected fewer than 8 sources.
            for i in range(len(sources)):
                if sources[i]['doc'] != "":
                    docs += f"Document <|reserved_special_token_{i+1}|>: {sources[i]['doc']}\n"
                    citations.append(sources[i]['doc'])
                else:
                    break
            if len(citations) == 0:
                # No usable documents for this question: skip it entirely.
                continue
            prompt += docs
            prompt += f"\nAnswer:"
            if is_train:
                # One training example per human-written summary of the group.
                for e in example:
                    ret.append(InputData(inputs = prompt + cite2token(e['summary']),
                                         citations=citations, prompt = prompt))
            else:
                ret.append(InputData(inputs=prompt, labels=[e['summary'] for e in example], \
                    grounds=[i for e in example for i in e['covered_short_answers']], \
                    citations=citations, query = example[0]['question']))
        return ret

    def loading_metric(self):
        """Return the attribution metric for QuoteSum ('alce' uses qsum-a)."""
        config = {}
        config['task'] = 'qsum'
        if self.sub == 'alce':
            config['metric'] = metric_list['qsum-a']
        else:
            config['metric'] = metric_list['qsum']
        return AutoMetric("attribute", config)
303
+
304
+
305
class Front(AttributedAnswerTask):
    """Front attribution task loaded from pre-built SFT/DPO JSON dumps."""

    def __init__(self, sub):
        # `sub` selects the dump: 'sft' or anything else (dpo).
        super().__init__()
        self.inst = 'Extract the relevant content from the provided documents and then use the extracted content to guide answer generation and cite the sources properly.'
        self.sub = sub

    def loading_data(self, is_train: bool = False, few_shot: bool = True
                     ) -> List[InputData]:
        """Build InputData records from the Front dump matching this instruction."""
        few_shot = False  ##############
        source_file = ("/yy21/MoE-PEFT/dataset/front/sft.json"
                       if self.sub == 'sft'
                       else "/yy21/MoE-PEFT/dataset/front/dpo.json")
        with open(source_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
        logging.info("Preparing data for Front")
        ret: List[InputData] = []
        doc_pattern = r"Document \[(\d+)\]: (.*?)(?=Document \[\d+\]:|$)"
        for data_point in data:
            # Keep only records that carry exactly this task's instruction.
            if data_point['instruction'] != self.inst:
                continue
            raw_input = data_point['input']
            chat_header = ("<|start_header_id|>system<|end_header_id|>\n\n"
                           + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n")
            prompt = cite2token(chat_header + self.inst + raw_input + "\nAnswer:")
            # The question sits between "Question: " and the first blank line.
            q_start = len("Question: ")
            q_end = raw_input.find("\n\n", q_start)
            question = raw_input[q_start:q_end]
            # Extract each numbered document body after the question section.
            doc_matches = re.findall(doc_pattern, raw_input[q_end + 2:], re.DOTALL)
            cites = [body.strip() for _, body in doc_matches]
            # The gold answer follows the "[ANSWER]" marker in the output field.
            marker = "[ANSWER]"
            raw_output = data_point['output']
            answer = cite2token(raw_output[raw_output.find(marker) + len(marker):])
            if is_train:
                ret.append(InputData(inputs=prompt + answer, prompt=prompt, citations=cites))
            else:
                ret.append(InputData(inputs=prompt, labels=answer,
                                     citations=cites, query=question))
        return ret

    def loading_metric(self):
        """Return the attribution metric evaluator configured for Front."""
        metric_config = {'task': 'front', 'metric': metric_list['front']}
        return AutoMetric("attribute", metric_config)
353
+
354
+
355
class Synsciqa(AttributedAnswerTask):
    """SynSciQA attributed-QA training task.

    Parses each record's instruction for a [BEGIN OF SOURCES]..[END OF SOURCES]
    block and the quoted question, rewrites the "(author, year)" citations in
    the response into reserved special tokens, and emits training examples.
    """

    def __init__(self, sub):
        # `sub` selects the dataset variant: 'synsci', 'synsci+' or 'synsci++'.
        super().__init__()
        self.sub = sub
        self.inst = lambda query: f"Can you respond to the question {query} by only relying on the sources. Ignore all sources that do not provide an answer to the question. Do not include any knowledge from outside of these sources. Only write a single paragraph. Each sentence must end with the reference in the form of (author, year, page number). Stricly follow this format. Citing multiple sources in one sentence is not allowed. However, if no source addresses the question, admit truthfully that no answer can be given. Answer the question concisly and avoid being verbose."
        self.inst_a = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_new = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite all of them at the end of the sentences. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document in each sentence.'

    def loading_data(self, is_train: bool = False, few_shot: bool = True
                     ) -> List[InputData]:
        """Load the SynSciQA variant selected by ``self.sub``.

        Only training examples are produced: when ``is_train`` is False the
        loop appends nothing and an empty list is returned.
        """
        few_shot = False  ##############
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Resolve the dataset file two levels up, under dataset/SynSciQA.
        if self.sub == 'synsci':
            relative_path = "../../dataset/SynSciQA/SynSciQA.json"
        elif self.sub == 'synsci+':
            relative_path = "../../dataset/SynSciQA/SynSciQA+.json"
        elif self.sub == 'synsci++':
            relative_path = "../../dataset/SynSciQA/SynSciQA++.json"
        else:
            # FIX: previously an unrecognized sub fell through and crashed
            # below with an opaque NameError on `relative_path`.
            raise ValueError(f"Unknown SynSciQA variant: {self.sub!r}")
        file_path = os.path.join(current_dir, relative_path)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        logging.info("Preparing data for SynsciQA")
        ret: List[InputData] = []
        for line in tqdm(data):
            data_point = line["instruction"]
            answer = line["response"]
            # Slice out the newline-separated source list.
            doc_start = data_point.find("[BEGIN OF SOURCES]")
            doc_end = data_point.find("[END OF SOURCES]")
            documents = data_point[doc_start + len("[BEGIN OF SOURCES]"): doc_end].strip().split("\n")
            # FIX: the assert message used to be `print(...)`, which evaluates
            # to None, so a failing assert carried no message.
            assert len(documents) > 0, "No docs detected!"

            # The question is the first double-quoted string after the sources.
            data_point = data_point[doc_end + len("[END OF SOURCES]"):]
            pattern = r'"([^"]*)"'
            query = re.findall(pattern, data_point)
            prompt = "<|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            prompt += f"\n\nQuestion: {query[0]}\n"

            docs = ""
            citations = []
            index_map = []
            index_map2 = []
            for i, d in enumerate(documents):
                # Each source line is "<citation id>: <content>".
                Ids = d[:d.find(":")]
                cont = d[d.find(":") + 2:]
                docs += f"Document <|reserved_special_token_{i+1}|>: {cont}\n"
                citations.append(cont)
                # Map both "(id)" and bare "id" spellings to the special token.
                index_map.append({'index': f"({Ids})", 'ID': f'<|reserved_special_token_{i+1}|>'})
                index_map2.append({'index': f"{Ids}", 'ID': f'<|reserved_special_token_{i+1}|>'})
            index_map = {item['index']: item['ID'] for item in index_map}
            index_map2 = {item['index']: item['ID'] for item in index_map2}
            prompt += docs
            prompt += "\nAnswer:"
            # Replace every occurrence of a known citation id in the answer
            # with its reserved special token (parenthesized form first).
            pattern = re.compile('|'.join(map(re.escape, index_map)))
            answer = pattern.sub(lambda m: index_map[m.group()], answer)
            pattern = re.compile('|'.join(map(re.escape, index_map2)))
            answer = pattern.sub(lambda m: index_map2[m.group()], answer)
            # Collapse "( <tok> ; <tok> )" pairs into adjacent tokens.
            pattern = r'\(\s*(<\|[^|]+\|>)\s*;\s*(<\|[^|]+\|>)\s*\)'
            answer = re.sub(pattern, r'\1\2', answer)

            # Skip answers that ended up with no citation token at all.
            pattern = r'<\|reserved_special_token_\d+\|>'
            if re.search(pattern, answer) is None:
                continue
            # Skip answers still containing raw "(author, year, p. X)" citations
            # that the rewrites above failed to map.
            pattern = r"\((?:[^)]*,){2}[^)]*p\.[^)]*\)"
            if re.findall(pattern, answer):
                continue
            if is_train:
                ret.append(InputData(
                    inputs = prompt + answer, citations = citations, prompt = prompt
                ))
        return ret

    def loading_metric(self):
        """Return the attribution metric (shares the 'front' configuration)."""
        config = {}
        config['task'] = 'front'
        config['metric'] = metric_list['front']
        return AutoMetric("attribute", config)
456
+
457
+
458
class Reinf(AttributedAnswerTask):
    """Attributed-answer task over the reinforcement training split."""

    def __init__(self):
        super().__init__()
        # Instruction variants; `inst_special_token` is the one actually
        # injected into the prompt in `loading_data`.
        self.inst_a = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing several search results, use [1][2][3]. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'
        self.inst_special_token = 'Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.'

    def loading_data(self, is_train: bool = False, few_shot: bool = True
                     ) -> List[InputData]:
        """Load the combined reinforcement set and build `InputData` records.

        Answers without `[i]` citations, or with citations pointing past the
        document list, are skipped; surviving citations are rewritten into
        `<|reserved_special_token_i|>` markers.
        """
        few_shot = False  # few-shot prompting disabled for this task
        with open("/yy21/MoE-PEFT/dataset/reinforcement/combined_train.json", 'r', encoding='utf-8') as file:
            data = json.load(file)
        logging.info("Preparing data for Reinforcement")
        ret: List[InputData] = []
        cite_pattern = r'\[(\d+)\]'
        for sample in tqdm(data):
            answer = sample["output"][0]
            cited = re.findall(cite_pattern, answer)
            # Require at least one citation, all within range of the docs.
            if not cited:
                continue
            if max(int(c) for c in cited) > len(sample["docs"]):
                continue

            documents = sample["docs"]
            answer = self.get_ans(answer)

            prompt = (
                "<|start_header_id|>system<|end_header_id|>\n\n"
                "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            )
            prompt += self.inst_special_token
            prompt += f"\n\nQuestion: {sample['question']}\n"

            citations = [d["text"] for d in documents]
            prompt += "".join(
                f"Document <|reserved_special_token_{i + 1}|>: {text}\n"
                for i, text in enumerate(citations)
            )
            prompt += "\nAnswer:"
            if is_train:
                ret.append(InputData(
                    inputs=prompt + answer, citations=citations, prompt=prompt
                ))
        return ret

    def get_ans(self, sent):
        """Rewrite `[i]` citation marks to reserved special tokens."""
        return re.sub(r'\[(\d+)\]',
                      lambda m: f"<|reserved_special_token_{m.group(1)}|>",
                      sent)

    def loading_metric(self):
        """Build the attribution metric for the 'front' task."""
        cfg = {'task': 'front', 'metric': metric_list['front']}
        return AutoMetric("attribute", cfg)
521
+
522
def sents_embed(sents, model, tokenizer, device):
    """Encode each sentence with `model` and stack the pooled embeddings.

    Returns a tensor of shape (len(sents), hidden) built from the model's
    `pooler_output` for each sentence, computed without gradients.
    """
    pooled = []
    with torch.no_grad():
        for text in sents:
            encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
            pooled.append(model(**encoded).pooler_output)
    return torch.stack(pooled).squeeze(1)
532
+
533
def cite2token(sent):
    """Convert `[i]` citation marks into `<|reserved_special_token_i|>`."""
    return re.sub(r'\[(\d+)\]', r'<|reserved_special_token_\g<1>|>', sent)
537
+
538
# Metric bundles per task name, consumed by the Task classes' loading_metric.
# 'front' relies entirely on the external attribute metric, hence empty.
metric_list = {
    'asqa': ['cite_pr', 'length', 'short_ans'],
    'qsum': ['rouge_all', 'semqa_f1', 'semqa_short'],
    'qsum-a': ['rouge_all','semqa_short', 'cite_pr', 'length', 'semqa_f1'],
    'eli5': ['cite_pr', 'eli5_acc', 'length'],
    'qam': ['cite_pr', 'qampari'],
    'front': [],
}
546
+
547
def update_task_dict(task_dict):
    """Register every attributed-answer task into `task_dict` (in place)."""
    registered = {
        "asqa": ASQA(),
        "qsum": QouteSum('vani'),
        "qsum-a": QouteSum('alce'),
        "qsum-ans": QouteSum('ans'),
        "eli5": ELI5(),
        "front-s": Front('sft'),
        "front-d": Front('dpo'),
        "synsci": Synsciqa('synsci'),
        "synsci+": Synsciqa('synsci+'),
        "synsci++": Synsciqa('synsci++'),
        "rein": Reinf(),
        "qam": Qampari(),
    }
    task_dict.update(registered)
564
+
565
if __name__ == '__main__':
    # Ad-hoc smoke test: construct a task and run its data loading.
    # NOTE(review): QouteSum is constructed with an argument everywhere else
    # ('vani'/'alce'/'ans') — confirm the no-arg call here is intended.
    asqa = QouteSum()
    asqa.loading_data()
c2cite/tasks/common.py ADDED
@@ -0,0 +1,1045 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+ import json
5
+ import copy
6
+ import string
7
+ from nltk import sent_tokenize
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+ from rouge import Rouge
11
+ import collections
12
+ from rouge_score import rouge_scorer, scoring
13
+ import functools
14
+ from typing import Any, Callable, Dict, List, Optional, Tuple
15
+
16
+ import transformers
17
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
18
+ import datasets as hf_datasets
19
+ import evaluate as hf_evaluate
20
+ import torch
21
+
22
+ from moe_peft.common import InputData, Prompt
23
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
24
+
25
# Lazily-initialized evaluation models, populated on first use.
# NOTE(review): `global` at module level is a no-op statement; kept as-is.
global autoais_model, autoais_tokenizer
autoais_model = None
autoais_tokenizer = None
qa_pipeline = None
# Safe positional lookup: None when the index is out of range.
get_docs_by_index = lambda i,docs: docs[i] if i < len(docs) else None
ais_LLM = None

# Device hosting all evaluation models (NLI + QA pipeline).
evaluate_device = 'cuda:6'
#QA_MODEL = "gaotianyu1350/roberta-large-squad"
QA_MODEL = "/yy21/qa_model"
#AUTOAIS_MODEL = "google/t5_xxl_true_nli_mixture"
AUTOAIS_MODEL = "/yy21/autoais"
37
+
38
class BasicMetric:
    """No-op base class for metrics.

    Subclasses override `add_batch` to accumulate predictions/references
    and `compute` to return the final scores.
    """

    def __init__(self) -> None:
        pass

    def add_batch(self, predictions: torch.Tensor,
                  references: Optional[torch.Tensor] = None):
        """Accumulate one batch of results; no-op in the base class.

        The original class defined `add_batch` twice (a one-argument
        `add_batch(data)` and a two-argument form); the second definition
        silently shadowed the first. Merged into one signature with an
        optional `references`, backward-compatible with both call styles.
        """
        pass

    def compute(self) -> Dict[str, Any]:
        """Return the metric's results; no-op in the base class."""
        pass
50
+
51
+
52
+ from statistics import harmonic_mean
53
+
54
def normalize_answers(text):
    """Normalize a QA answer, TriviaQA-style.

    Lowercases, turns every punctuation character into a space, removes
    the articles a/an/the, and collapses runs of whitespace.
    """
    punct = set(string.punctuation)
    lowered = text.lower()
    no_punct = "".join(" " if ch in punct else ch for ch in lowered)
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    return " ".join(no_articles.split())
73
+
74
+
75
def strip_attribution_tokens(text):
    """Drop `[ i ... ]` attribution markup, keeping only the quoted text."""
    quoted = re.compile(r'\[ ([1-9]) ([^\[\]]*) \]')
    return quoted.sub(r'\2', text)
78
+
79
+
80
def non_quoted(text):
    """Return only the text outside of `[ i ... ]` quoted spans."""
    span = re.compile(r'\[ ([1-9]) ([^\[\]]*) \]')
    return span.sub('', text)
83
+
84
+
85
def only_quoted(text, sources='1-9', sep=' '):
    """Join the contents of the `[ i ... ]` spans whose source id matches
    the `sources` character class, separated by `sep`."""
    pattern = r'\[ [{}] ([^\[\]]*) \]'.format(sources)
    pieces = (m.group(1) for m in re.finditer(pattern, text))
    return sep.join(pieces)
88
+
89
+
90
def quoted_sources(text):
    """Return the sorted unique source ids quoted in the answer."""
    ids = {int(m.group(1)) for m in re.finditer(r'\[ ([1-9]) [^\[\]]* \]', text)}
    return sorted(ids)
93
+
94
+
95
def score_all(data, scorer, aggr_measure, score_keys, preprocess_func=None, bootstrap=False):
    """
    Aggregates across all targets per sample.

    For every example, scores the prediction against each reference target
    and keeps the maximum according to `aggr_measure`.

    Args:
        data: list of dicts with fields 'answer' (list of reference strings)
            and 'output' (the predicted string).
        scorer: object exposing `.score(target=..., prediction=...) -> dict`.
        aggr_measure: key used to pick the best reference. If it contains
            'rouge' the score values are rouge Score tuples (compared by
            `.fmeasure`); 'independent' maximizes each key separately,
            possibly over different references.
        score_keys: score keys collected into the returned structures.
        preprocess_func: optional text transform applied to both target and
            prediction before scoring.
        bootstrap: when True, also return bootstrap (mid, low, high) CIs.

    Returns:
        Per-example scores (a list for rouge measures, a dict of lists
        otherwise), all scaled by 100; with `bootstrap=True`, a
        `(bootstrap_results, all_scores)` tuple.
    """
    all_targets = [d['answer'] for d in data]
    all_predictions = [d['output'] for d in data]

    # Fixed seed so BootstrapAggregator's resampling is reproducible.
    np.random.seed(1337)

    is_rouge_measure = 'rouge' in aggr_measure

    if preprocess_func is not None:
        scoring_func = lambda target, prediction: scorer.score(target=preprocess_func(target), prediction=preprocess_func(prediction))
    else:
        scoring_func = scorer.score

    aggregator = scoring.BootstrapAggregator()
    all_scores = [] if is_rouge_measure else dict((k,[]) for k in score_keys)
    for targets, prediction in zip(all_targets, all_predictions):
        # Max across references by aggr_measure
        if is_rouge_measure:
            max_scores = max([scoring_func(target, prediction) for target in targets], key=lambda x: x[aggr_measure].fmeasure)

            aggregator.add_scores(max_scores)
            all_scores.append(max_scores[aggr_measure].fmeasure*100)
        else:
            if aggr_measure == 'independent':
                # Best value per key, each possibly from a different reference.
                max_scores = {}
                for key in score_keys:
                    max_scores[key] = max([scoring_func(target, prediction)[key] for target in targets])
            else:
                max_scores = max([scoring_func(target, prediction) for target in targets], key=lambda x: x[aggr_measure])

            aggregator.add_scores(max_scores)
            for key in score_keys:
                all_scores[key].append(max_scores[key]*100)

    if not bootstrap:
        return all_scores

    result = aggregator.aggregate()
    # Rouge results are Score namedtuples; plain dict scores are floats.
    postprocess_result = (lambda x: x.fmeasure*100) if is_rouge_measure else (lambda x: x*100)
    bootstrap_results = {}
    for key in score_keys:
        bootstrap_results[key] = (postprocess_result(result[key].mid), postprocess_result(result[key].low), postprocess_result(result[key].high))
    return bootstrap_results, all_scores
144
+
145
## ROUGE ##

# ROUGE over all references (max-aggregated by rougeLsum F-measure), with
# attribution tokens stripped before scoring; returns bootstrap CIs too.
score_all_rouge = functools.partial(score_all, scorer=rouge_scorer.RougeScorer(rouge_types=("rouge1", "rouge2", "rougeLsum", "rougeL")), aggr_measure='rougeLsum', score_keys=("rouge1", "rouge2", "rougeLsum"), preprocess_func=strip_attribution_tokens, bootstrap=True)
148
+
149
+ ## F1 ##
150
+
151
+ class _f1_scorer:
152
+ def score(self, target, prediction):
153
+ """Computes token F1 score for a single target and prediction."""
154
+ prediction_tokens = prediction.split()
155
+ target_tokens = target.split()
156
+ common = (collections.Counter(prediction_tokens) &
157
+ collections.Counter(target_tokens))
158
+ num_same = sum(common.values())
159
+ if len(target_tokens) == 0 and len(prediction_tokens) == 0:
160
+ return {'F1': 1.0, 'recall': 1.0, 'precision': 1.0}
161
+ elif len(target_tokens) == 0 and len(prediction_tokens) > 0:
162
+ return {'F1': 0.0, 'recall': 1.0, 'precision': 0.0}
163
+ elif len(target_tokens) > 0 and len(prediction_tokens) == 0:
164
+ return {'F1': 0.0, 'recall': 0.0, 'precision': 1.0}
165
+ elif num_same == 0:
166
+ return {'F1': 0.0, 'recall': 0.0, 'precision': 0.0}
167
+ else:
168
+ precision = 1.0 * num_same / len(prediction_tokens)
169
+ recall = 1.0 * num_same / len(target_tokens)
170
+ f1 = (2 * precision * recall) / (precision + recall)
171
+ return {'F1': f1, 'recall': recall, 'precision': precision}
172
+
173
+
174
+ score_all_f1 = functools.partial(score_all, scorer=_f1_scorer(), aggr_measure='F1', score_keys=("F1", "recall", "precision"))
175
+
176
+
177
def preprocess_quotes_f1(text, sep=' ', sources='1-7'):
    """Extract the quoted spans for `sources` and QA-normalize the result."""
    quoted = only_quoted(text, sep=sep, sources=sources)
    return normalize_answers(quoted)
180
+
181
+
182
def score_semqa_f1(data, harmonic=False):
    """SemQA attribution F1.

    Token precision/recall/F1 of the quoted spans are computed per source id
    (1-7), then averaged per example over the sources whose `docs` entry is
    truthy. With `harmonic=True` the harmonic mean of all precisions and
    recalls replaces the mean of F1s.

    Args:
        data: list of dicts with 'answer', 'output' and 'docs' fields;
            `docs` is indexed by source id (1-7).
        harmonic: aggregation switch, see above.

    Returns:
        Mean SemQA F1 over all examples.
    """
    examples = [d['docs'] for d in data]
    per_source_prf1 = {}
    for source in range(1, 8):
        # Restrict quoted-span extraction to this single source id.
        preprocess_quotes_f1_partial_sources = functools.partial(preprocess_quotes_f1, sep=' ', sources=f'{source}')
        scores = score_all_f1(data, aggr_measure='independent', preprocess_func=preprocess_quotes_f1_partial_sources)

        for aggr_measure in ('F1', 'recall', 'precision'):
            per_source_prf1[f'{aggr_measure}_source_{source}'] = scores[aggr_measure]

    semqa_f1s = []
    for i in range(len(examples)):
        precisions, recalls, f1s = [], [] , []
        for source in range(1,8):
            # Only sources actually present in this example's docs count.
            if examples[i][source]:
                precisions.append(per_source_prf1[f'precision_source_{source}'][i])
                recalls.append(per_source_prf1[f'recall_source_{source}'][i])
                f1s.append(per_source_prf1[f'F1_source_{source}'][i])
        if harmonic:
            f1 = harmonic_mean(precisions + recalls)
        else:
            f1 = np.mean(f1s)
        semqa_f1s.append(f1)

    return np.mean(semqa_f1s)
207
+
208
+
209
+ score_all_recall = functools.partial(score_all, scorer=_f1_scorer(), aggr_measure='recall', score_keys=("recall",))
210
+
211
+
212
def score_semqa_short_recall(data):
    """SemQA short-answer recall.

    If the QA pairs carry a 'num' field the data is ASQA-style and is
    delegated to `compute_str_em`; otherwise, token recall of the quoted
    spans is computed per source (1-7) and averaged, per example, over the
    sources that actually appear in that example's targets.
    """
    # ASQA-style short answers: remap to compute_str_em's expected schema.
    if 'num' in data[0]['qa_pairs'][0].keys():
        remapped = [
            {
                'qa_pairs': [{'short_answers': qa['ans']} for qa in d['qa_pairs']],
                'output': d['output'],
            }
            for d in data
        ]
        return compute_str_em(remapped)

    # Drop examples whose targets are missing or all empty.
    paired = []
    kept_targets = []
    for d in data:
        targets = d['qa_pairs']
        if len(targets) == 0 or all(t == '' for t in targets):
            continue
        paired.append({'answer': targets, 'output': d['output']})
        kept_targets.append(targets)

    # Per-source token recall over the quoted spans.
    per_source_recall = {}
    for source in range(1, 8):
        extract = functools.partial(preprocess_quotes_f1, sep=' ', sources=f'{source}')
        scores = score_all_recall(paired, preprocess_func=extract)
        per_source_recall[f'recall_source_{source}'] = scores['recall']

    # Average each example's recall over the sources present in its targets.
    semqa_recalls = []
    for i, targets in enumerate(kept_targets):
        recalls = []
        for source in range(1, 8):
            extract = functools.partial(preprocess_quotes_f1, sep=' ', sources=f'{source}')
            if any(extract(t) for t in targets):
                recalls.append(per_source_recall[f'recall_source_{source}'][i])
        semqa_recalls.append(np.mean(recalls))

    return np.mean(semqa_recalls)
259
+
260
+
261
def exact_presence(short_answers, context):
    """Verify if any of the answers is present in the given context.

    Args:
        short_answers: list of short answers to look for in the context
        context: a paragraph to search for short answers

    Returns:
        True if any normalized short answer is a substring of the
        normalized context.
    """
    normalized_context = normalize_answer(context)
    return any(normalize_answer(sa) in normalized_context for sa in short_answers)
278
+
279
+
280
def normalize_answer(s):
    """Lowercase, delete punctuation, drop articles, collapse whitespace."""
    punct = set(string.punctuation)
    # Punctuation is deleted (not replaced by spaces), matching SQuAD-style
    # normalization.
    text = "".join(ch for ch in s.lower() if ch not in punct)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())
295
+
296
+
297
def remove_citations(sent):
    """Strip `[n]`-style citation marks and ' |' separators from a sentence."""
    without_spaced = re.sub(r" \[\d+", "", sent)
    without_brackets = re.sub(r"\[\d+", "", without_spaced)
    return without_brackets.replace(" |", "").replace("]", "")
299
+
300
+
301
def load_auto_ais():
    """Lazily load the NLI model/tokenizer used for citation scoring.

    Populates the module-level `autoais_model` / `autoais_tokenizer`
    globals on the configured evaluation device in bfloat16.
    """
    global autoais_model, autoais_tokenizer
    print('Initializing eval model for citation precision and recall...')
    autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, device_map=evaluate_device, )
    autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False)
    print('Done!')
307
+
308
+ def _run_nli_autoais(passage, claim, test = False):
309
+ """
310
+ Run inference for assessing AIS between a premise and .hypothesis
311
+ Adapted from https://github.com/google-research-datasets/Attributed-QA/blob/main/evaluation.py
312
+ """
313
+ if not test:
314
+ global autoais_model, autoais_tokenizer
315
+ if not autoais_model:
316
+ load_auto_ais()
317
+ input_text = "premise: {} hypothesis: {}".format(passage, claim)
318
+ input_ids = autoais_tokenizer(input_text, return_tensors="pt").input_ids.to(autoais_model.device)
319
+ with torch.inference_mode():
320
+ outputs = autoais_model.generate(input_ids, max_new_tokens=10)
321
+ result = autoais_tokenizer.decode(outputs[0], skip_special_tokens=True)
322
+ inference = 1 if result == "1" else 0
323
+ return inference
324
+ else:
325
+ res = 114514
326
+
327
+ return res
328
+
329
+
330
def compute_autoais(data,
                    qampari=False,
                    at_most_sents = 50,
                    at_most_citations=3,
                    entail_function = _run_nli_autoais):
    """
    Compute AutoAIS score (citation recall and precision).

    Each output is split into sentences; a sentence counts as supported
    when the NLI model judges the concatenation of its cited documents to
    entail the citation-stripped sentence. Precision additionally checks,
    for multi-citation sentences, whether each cited document is necessary.

    Args:
        data: requires field `output` and `docs`
            - docs should be a list of items with fields `title` and `text` (or `phrase` and `sent` for QA-extracted docs)
        qampari: treat the output as a comma-separated answer list, with
            each piece prefixed by the query, instead of sentence-tokenizing.
        at_most_sents: cap on sentences scored per output.
        at_most_citations: cap on citations considered per sentence.
        entail_function: NLI callable `(premise, hypothesis) -> 0/1`.

    Returns:
        Dict with `citation_rec` and `citation_prec` on a 0-100 scale.
    """

    global autoais_model, autoais_tokenizer


    ais_scores = []
    ais_scores_prec = []

    sent_total = 0
    sent_mcite = 0
    sent_mcite_support = 0
    sent_mcite_overcite = 0
    autoais_log = []
    for item in tqdm(data):
        # Get sentences by using NLTK
        if qampari:
            sents = [item['query'] + " " + x.strip() for x in
                     item['output'].rstrip().rstrip(".").rstrip(",").split(",")]
        else:
            sents = sent_tokenize(item['output'])[:at_most_sents]
        if len(sents) == 0:
            # Empty output: zero recall and precision for this item.
            ais_scores.append(0.0)
            ais_scores_prec.append(0.0) # len(sents))
            continue

        target_sents = [remove_citations(sent).strip() for sent in sents]

        entail = 0
        entail_prec = 0
        total_citations = 0
        for sent_id, sent in enumerate(sents):
            target_sent = target_sents[sent_id] # Citation removed and (if opted for) decontextualized
            joint_entail = -1 # Undecided

            # Find references; supports both "[3]" and grouped "[1, 2]" forms.
            #ref = [int(r[1:]) - 1 for r in re.findall(r"\[\d+", sent)] # In text citation id starts from 1
            matches = re.findall(r"\[(\d+(?:,\s*\d+)*)\]", sent)
            ref = [int(num)-1 for match in matches for num in match.replace(' ', '').split(',')]
            if len(ref) == 0:
                # No citations
                joint_entail = 0
            elif any([ref_id >= len(item['docs']) for ref_id in ref]):
                # Citations out of range
                joint_entail = 0
            else:
                if at_most_citations is not None:
                    ref = ref[:at_most_citations]
                total_citations += len(ref)
                joint_passage = '\n'.join([(item['docs'][psgs_id]) for psgs_id in ref])

            # If not directly rejected by citation format error, calculate the recall score
            if joint_entail == -1:
                joint_entail = entail_function(joint_passage, target_sent)
                autoais_log.append({
                    #"question": item['question'],
                    "output": item['output'],
                    "claim": sent,
                    "passage": [joint_passage],
                    "model_type": "NLI",
                    "model_output": joint_entail,
                })

            entail += joint_entail
            if len(ref) > 1:
                sent_mcite += 1

            # calculate the precision score if applicable
            if joint_entail and len(ref) > 1:
                sent_mcite_support += 1
                # Precision check: did the model cite any unnecessary documents?
                for psgs_id in ref:
                    # condition A: this document alone entails the sentence
                    passage = item['docs'][psgs_id]
                    nli_result = entail_function(passage, target_sent)

                    # condition B: the remaining documents entail it without this one
                    if not nli_result:
                        subset_exclude = copy.deepcopy(ref)
                        subset_exclude.remove(psgs_id)
                        passage = '\n'.join([item['docs'][pid] for pid in subset_exclude])
                        nli_result =entail_function(passage, target_sent)
                        if nli_result: # psgs_id is not necessary
                            # NOTE(review): `flag` is assigned but never read.
                            flag = 0
                            sent_mcite_overcite += 1
                        else:
                            entail_prec += 1
                    else:
                        entail_prec += 1
            else:
                entail_prec += joint_entail
        sent_total += len(sents)
        ais_scores.append(entail / len(sents))
        ais_scores_prec.append(entail_prec / total_citations if total_citations > 0 else 0) # len(sents))

    if sent_mcite > 0 and sent_mcite_support > 0:
        print(
            "Among all sentences, %.2f%% have multiple citations, among which %.2f%% are supported by the joint set, among which %.2f%% overcite." % (
                100 * sent_mcite / sent_total,
                100 * sent_mcite_support / sent_mcite,
                100 * sent_mcite_overcite / sent_mcite_support
            ))

    return {
        "citation_rec": 100 * np.mean(ais_scores),
        "citation_prec": 100 * np.mean(ais_scores_prec),
    }
450
+
451
+
452
def compute_f1(a_gold, a_pred):
    """Token-level F1 between two strings, after answer normalization."""

    def _tokens(s):
        # Empty/None strings tokenize to the empty list.
        return normalize_answer(s).split() if s else []

    gold = _tokens(a_gold)
    pred = _tokens(a_pred)

    overlap = collections.Counter(gold) & collections.Counter(pred)
    hits = sum(overlap.values())

    if not gold or not pred:
        # If either is no-answer, F1 is 1 only when both are empty.
        return int(gold == pred)
    if hits == 0:
        return 0

    precision = hits / len(pred)
    recall = hits / len(gold)
    return (2 * precision * recall) / (precision + recall)
478
+
479
+
480
def compute_exact(a_gold, a_pred):
    """Return 1 when both strings normalize to the same text, else 0."""
    same = normalize_answer(a_gold) == normalize_answer(a_pred)
    return int(same)
484
+
485
+
486
def compute_qa(data):
    """Compute QA-based accuracy.

    Runs the SQuAD question-answering pipeline over each item's
    citation-stripped output, asking every question in its `qa_pairs`,
    and scores the extracted answers against the gold short answers.

    Args:
        data: requires fields `qa_pairs` (each with 'question' and
            'short_answers') and `output`.

    Returns:
        QA metrics (QA-EM, QA-F1, QA-Hit), each on a 0-100 scale.
    """
    if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None:
        # `logging.warn` is a deprecated alias; use `logging.warning`.
        logging.warning("Warning: no QA pairs found in data")
        return {
            'QA-EM': 0,
            'QA-F1': 0,
            'QA-Hit': 0,
        }

    # Lazily build the shared question-answering pipeline.
    global qa_pipeline
    if not qa_pipeline:
        qa_pipeline = transformers.pipeline("question-answering", model=QA_MODEL, device = evaluate_device)

    em, f1, bins = [], [], []
    for item in tqdm(data):
        question = [qa_pair['question'] for qa_pair in item['qa_pairs']]
        # The pipeline requires a non-empty context; fall back to " ".
        context = item['output'] if len(item['output']) > 0 else " "
        results = qa_pipeline(question=question, context=remove_citations(context), handle_impossible_answer=True)
        loc_counter, loc_em, loc_f1 = 0, 0, 0

        for idx, res in enumerate(results):
            answers = item["qa_pairs"][idx]["short_answers"]
            prediction = res["answer"]

            # Best match over the acceptable gold short answers.
            loc_em += max([compute_exact(a, prediction) for a in answers])
            loc_f1 += max([compute_f1(a, prediction) for a in answers])
            loc_counter += 1

        em.append(loc_em / loc_counter)
        f1.append(loc_f1 / loc_counter)
        bins.append(loc_em == loc_counter)  # all questions answered exactly

    return {
        'QA-EM': 100 * np.mean(em),
        'QA-F1': 100 * np.mean(f1),
        'QA-Hit': 100 * np.mean(bins)
    }
535
+
536
+
537
def compute_claims(data):
    """Claim-level accuracy (ELI5-style).

    For each item, counts the fraction of gold claims (`qa_pairs`) that the
    citation-stripped output entails per the NLI model; averages over items
    and scales to 0-100.
    """
    global autoais_model, autoais_tokenizer
    if autoais_model is None:
        # Lazily load the NLI model on the configured evaluation device.
        autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16,
                                                              device_map=evaluate_device)
        autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False)
    scores = []
    for item in tqdm(data):
        normalized_output = remove_citations(item['output'])
        entail = 0
        claims = item["qa_pairs"]
        for claim in claims:
            # NLI direction: does the model output entail the gold claim?
            entail += _run_nli_autoais(normalized_output, claim)
        scores.append(entail / len(claims))
    return 100 * np.mean(scores)
556
+
557
+
558
def compute_qampari_f1(data, cot=False):
    """QAMPARI list-answer precision/recall/F1.

    Each output is parsed as a comma-separated answer list. Recall is also
    reported against at most 5 gold answers (`rec_top5`).

    Args:
        data: requires fields `output` and `answer` (a list of alias lists).
        cot: when True, drop a leading chain-of-thought segment before the
            first ':' and parse only what follows.

    Returns:
        Dict with mean `num_preds` and precision/recall/F1 (0-100 scale).
    """
    prec = []
    rec = []
    rec_top5 = []
    f1 = []
    f1_top5 = []

    num_preds = []
    for item in data:
        if cot:
            if ":" in item['output']:
                o = ':'.join(item['output'].split(":")[1:]) # try to separate the COT part and the answer list part.
            else:
                o = ""
        else:
            o = item['output']

        preds = [normalize_answer(x.strip()) for x in remove_citations(o).rstrip().rstrip(".").rstrip(",").split(",")]
        preds = [p for p in preds if len(p) > 0] # delete empty answers
        num_preds.append(len(preds))
        # Each gold answer is a list of acceptable aliases; a gold answer is
        # recalled when any alias appears among the predictions.
        answers = [[normalize_answer(x) for x in ans] for ans in item['answer']]
        flat_answers = [item for sublist in answers for item in sublist]
        prec.append(sum([p in flat_answers for p in preds]) / len(preds) if len(preds) > 0 else 0)

        rec.append(sum([any([x in preds for x in a]) for a in answers]) / len(answers))
        rec_top5.append(min(5, sum([any([x in preds for x in a]) for a in answers])) / min(5, len(answers)))
        if (prec[-1] + rec[-1]) == 0:
            f1.append(0)
        else:
            f1.append(2 * prec[-1] * rec[-1] / (prec[-1] + rec[-1]))
        if (prec[-1] + rec_top5[-1]) == 0:
            f1_top5.append(0)
        else:
            f1_top5.append(2 * prec[-1] * rec_top5[-1] / (prec[-1] + rec_top5[-1]))

    return {
        "num_preds": np.mean(num_preds),
        "qampari_prec": 100 * np.mean(prec),
        "qampari_rec": 100 * np.mean(rec),
        "qampari_rec_top5": 100 * np.mean(rec_top5),
        "qampari_f1": 100 * np.mean(f1),
        "qampari_f1_top5": 100 * np.mean(f1_top5),
    }
601
+
602
+
603
def compute_str_em(data):
    """Compute STR-EM metric (only for ASQA)
    Args:
        data: requires field `qa_pairs/short_answers` and `output`
    Returns:
        STR-EM on a 0-100 scale; 0 when no QA pairs are present.
    """
    if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None:
        return 0

    acc = []
    for item in data:
        loc_acc = []
        if len(item['qa_pairs']) == 0:
            continue
        # Only the first QA pair is checked; the all-pairs loop below was
        # deliberately disabled.
        loc_acc.append(exact_presence(item['qa_pairs'][0]['short_answers'], item["output"]))
        """for qa_pair in item['qa_pairs']:
            loc_acc.append(exact_presence(qa_pair['short_answers'], item["output"]))"""
        acc.append(float(np.mean(loc_acc)))
    return 100 * np.mean(acc) if len(acc) > 0 else 0
623
+
624
+
625
def compute_mauve(data):
    """Compute MAUVE fluency score between human and model continuations.

    Both sides are formed as "<query> <text>", whitespace-normalized,
    truncated to 100 words, and stripped of trailing punctuation.
    Requires fields `query`, `answer` and `output`.
    """

    logging.info("Computing MAUVE...")
    human_data = []
    model_data = []
    for item in data:
        # Remove ending punctuations
        # Remove any new lines
        # Truncate by 100 words
        human_data.append(
            ' '.join((item['query'] + " " + item['answer'].strip()).split()[:100]).rstrip(string.punctuation))
        model_data.append(
            ' '.join((item['query'] + " " + item['output'].strip()).split()[:100]).rstrip(string.punctuation))

    # Imported lazily: mauve is an optional, heavy dependency.
    import mauve
    out = mauve.compute_mauve(
        p_text=human_data,
        q_text=model_data,
        device_id=0,
        max_text_length=512,
        verbose=True,
        batch_size=8,
        featurize_model_name="gpt2-large"
    )
    return out.mauve * 100
651
+
652
+
653
def compute_rouge_l(data):
    """Average ROUGE-L recall/precision/F over all items.

    Items missing either `output` or `answer` contribute zero (a warning is
    printed), while the denominator remains the full item count — matching
    the original behavior.

    Args:
        data: list of dicts with `output` and `answer` string fields.

    Returns:
        Dict with mean 'r', 'p' and 'f' ROUGE-L components.
    """
    total = len(data)
    res = {
        "r": 0.0,
        "p": 0.0,
        "f": 0.0
    }
    rouge = Rouge()  # hoisted out of the loop: no need to rebuild per item
    for item in data:
        if item['output'] and item['answer']:
            scores = rouge.get_scores(item['output'], item['answer'])
            res['r'] += scores[0]['rouge-l']['r']
            res['p'] += scores[0]['rouge-l']['p']
            res['f'] += scores[0]['rouge-l']['f']
        else:
            print('Warning: no hypothesis or references')
    res['r'] /= total
    res['p'] /= total
    res['f'] /= total

    return res
675
+
676
+
677
def compute_length(data):
    """Mean number of space-separated tokens in `output` across items."""
    counts = [len(item['output'].split(' ')) for item in data]
    return sum(counts) / len(data)
679
+
680
+
681
# Dispatch table: metric name -> scoring function. Each function receives
# the record list accumulated by AttributeMetric.add_batch.
metric_list = {
    'cite_pr': compute_autoais,
    'asqa_acc': compute_qa,
    'eli5_acc': compute_claims,
    'qampari': compute_qampari_f1,
    'short_ans': compute_str_em,
    # 'fluence': compute_mauve,
    'rouge': compute_rouge_l,
    'length': compute_length,
    'rouge_all': score_all_rouge,
    'semqa_f1': score_semqa_f1,  # acts like precision
    'semqa_short': score_semqa_short_recall,  # acts like recall
}
694
+
695
# Per-metric field whitelist: AttributeMetric.add_batch keeps only these
# keys from each record. Values are always None; only key membership is
# used. NOTE(review): 'semqa' has no entry in metric_list above — it is
# kept only for the (disabled) combined-score path in compute().
data_list = {
    'cite_pr': {'output': None, 'docs': None, 'query': None},
    'asqa_acc': {'output': None,'qa_pairs': None, 'query': None},
    'eli5_acc': {'output': None, 'qa_pairs': None},
    'qampari': {'output': None, 'answer': None},
    'short_ans': {'qa_pairs': None, 'output': None},
    # 'fluence': {'query': None, 'answer': None, 'output': None},
    'rouge': {'output': None, 'answer': None},
    'length': {'output': None},
    'rouge_all': {'answer': None, 'output': None},
    'semqa_f1': {'answer': None, 'output': None, 'docs': None},
    'semqa_short':{'output': None, 'qa_pairs': None},
    'semqa': {}
}
709
+
710
+
711
+
712
class AttributeMetric:
    """Accumulates per-example records for the attribution task and
    computes the metrics listed in ``config['metric']``.

    Each record added via :meth:`add_batch` is filtered down to the
    fields each metric declares in the module-level ``data_list``.
    """

    def __init__(self, config):
        self.task = 'attribute'
        self.metrics = config['metric']
        self.flag = False
        # One bucket per known metric; only the configured metrics'
        # buckets are ever filled.
        self.data = {
            'cite_pr': [],
            'asqa_acc': [],
            'eli5_acc': [],
            'qampari': [],
            'short_ans': [],
            'fluence': [],
            'rouge': [],
            'length': [],
            'rouge_all': [],
            'semqa_f1': [],
            'semqa_short': [],
            'semqa': [],
        }

    def add_batch(self, data):
        """Route one record (output, qa_pairs, answer, docs, query) into
        the buckets of the configured metrics, keeping only the fields
        each metric needs."""
        for metric in self.metrics:
            self.data[metric].append(
                {k: v for k, v in data.items() if k in data_list[metric]}
            )

    def compute(self):
        """Run every configured metric and return {metric_name: score}."""
        ans = {}
        for metric in self.metrics:
            # Fixed: the old assert used `logging.info(...)` as its
            # message, which always evaluates to None; use a real message.
            assert metric in metric_list, f"Invalid metric: {metric}"
            # Citation precision/recall needs QAMPARI-specific handling
            # when evaluated alongside the qampari metric.
            if metric == 'cite_pr' and 'qampari' in self.metrics:
                ans[metric] = metric_list[metric](data=self.data[metric], qampari=True)
            else:
                ans[metric] = metric_list[metric](data=self.data[metric])
        return ans
754
+
755
class AutoMetric(BasicMetric):
    """Metric wrapper selecting either the local AttributeMetric or a
    HuggingFace ``evaluate`` metric by task name.

    ``MOE_PEFT_METRIC_PATH`` (env var) optionally prefixes the path
    passed to ``evaluate.load``.
    """

    def __init__(self, task_name: str, config: Optional[List] = None) -> None:
        # Fixed: `config` now defaults to None so callers that only use
        # HF metrics (e.g. SequenceClassificationTask.loading_metric)
        # can omit it; it is only consumed by the "attribute" task.
        super().__init__()
        path_prefix = os.getenv("MOE_PEFT_METRIC_PATH")
        if path_prefix is None:
            path_prefix = ""
        elif not path_prefix.endswith(os.sep):
            path_prefix += os.sep

        if task_name == "attribute":
            self.metric_ = AttributeMetric(config)
        elif ":" in task_name:
            # "name:subset" selects a metric configuration.
            split = task_name.split(":")
            self.metric_ = hf_evaluate.load(path_prefix + split[0], split[1])
        else:
            self.metric_ = hf_evaluate.load(path_prefix + task_name)

    def add_batch(self, predictions: torch.Tensor, references: torch.Tensor):
        """Forward a batch of predictions/references to the wrapped metric."""
        self.metric_.add_batch(predictions=predictions, references=references)

    def compute(self) -> Dict[str, Any]:
        """Finalize and return the wrapped metric's scores."""
        return self.metric_.compute()
777
+
778
+
779
class BasicTask:
    """Abstract interface every concrete task fills in."""

    def __init__(self) -> None:
        pass

    @property
    def peft_task_type(self) -> str:
        """PEFT task-type string (e.g. "CAUSAL_LM", "SEQ_CLS")."""

    def loading_data(
        self, is_train: bool = True, path: Optional[str] = None
    ) -> List[InputData]:
        """Load the task's dataset as a list of InputData items."""

    def loading_metric(self) -> BasicMetric:
        """Return the metric object used to score this task."""

    def init_kwargs(self) -> Dict:
        """Extra keyword arguments used when building the model."""
        return {}
797
+
798
+
799
+ # Casual Fine-tuning Tasks
800
+ # Instant-Created Class
801
class CasualTask(BasicTask):
    """Plain causal-LM supervised fine-tuning on an instruction dataset."""

    @property
    def peft_task_type(self) -> str:
        return "CAUSAL_LM"

    def loading_data(
        self, is_train: bool = True, path: Optional[str] = None
    ) -> List[InputData]:
        assert path is not None, "Casual supervised fine-tuning requires data path."
        assert is_train, "Casual supervised fine-tuning task only supports training."
        # Resolve the dataset source: a local json(l) file, a
        # "name:subset" hub reference, or a plain hub name.
        if path.endswith(".json") or path.endswith(".jsonl"):
            dataset = hf_datasets.load_dataset("json", data_files=path)
        elif ":" in path:
            parts = path.split(":")
            dataset = hf_datasets.load_dataset(parts[0], parts[1])
        else:
            dataset = hf_datasets.load_dataset(path)
        # Wrap every training record into an InputData/Prompt pair.
        return [
            InputData(
                inputs=Prompt(
                    instruction=record["instruction"],
                    input=record.get("input", None),
                    label=record.get("output", None),
                )
            )
            for record in dataset["train"]
        ]
832
+
833
+
834
+ # Sequence Classification
835
class SequenceClassificationTask(BasicTask):
    """Sequence-classification task backed by a HuggingFace dataset and a
    HuggingFace ``evaluate`` metric.

    ``dataload_function`` maps one raw record to ``(inputs, labels)``.
    """

    def __init__(
        self,
        task_name: str,
        task_type: str,
        label_dtype: torch.dtype,
        num_labels: int,
        dataload_function: Callable,
        # Setting to `None` corresponds to the task name.
        metric_name: Optional[str] = None,
        # The default values are "train" and "validation".
        subset_map: Optional[Tuple[str, str]] = ("train", "validation"),
    ) -> None:
        super().__init__()
        self.task_name_ = task_name
        self.task_type_ = task_type
        self.label_dtype_ = label_dtype
        self.num_labels_ = num_labels
        self.dataload_function_ = dataload_function
        # The metric defaults to the task name when not given explicitly.
        if metric_name is None:
            self.metric_name_ = task_name
        else:
            self.metric_name_ = metric_name
        self.subset_map_ = subset_map

    @property
    def peft_task_type(self) -> str:
        return "SEQ_CLS"

    def loading_data(
        self, is_train: bool = True, path: Optional[str] = None
    ) -> List[InputData]:
        """Load the train or validation split and convert each record via
        ``dataload_function`` into an InputData(inputs, labels) item."""
        if ":" in self.task_name_:
            split = self.task_name_.split(":")
            data = hf_datasets.load_dataset(
                split[0] if path is None else path, split[1]
            )
        else:
            data = hf_datasets.load_dataset(self.task_name_ if path is None else path)
        data = data[self.subset_map_[0] if is_train else self.subset_map_[1]]
        logging.info(f"Preparing data for {self.task_name_.upper()}")
        ret: List[InputData] = []
        for data_point in data:
            inputs, labels = self.dataload_function_(data_point)
            assert isinstance(labels, List)
            ret.append(InputData(inputs=inputs, labels=labels))

        return ret

    def loading_metric(self) -> BasicMetric:
        # Fixed: AutoMetric.__init__ declares a second required `config`
        # parameter; calling it with one argument raised TypeError.
        # `config` is only meaningful for the "attribute" task.
        return AutoMetric(self.metric_name_, None)

    def init_kwargs(self) -> Dict:
        """Model-construction kwargs for a classification head."""
        return {
            "task_type": self.task_type_,
            "num_labels": self.num_labels_,
            "label_dtype": self.label_dtype_,
        }
893
+
894
+
895
+ # Common Sense
896
class CommonSenseTask(BasicTask):
    """Base class for multiple-choice common-sense QA tasks."""

    def __init__(self) -> None:
        super().__init__()
        self.label_dtype_ = None
        self.task_type_ = "common_sense"

    @property
    def peft_task_type(self) -> str:
        return "QUESTION_ANS"

    def label_list(self) -> List[str]:
        """Candidate answer labels; provided by subclasses."""
908
+
909
+
910
class AttributeTask(BasicTask):
    """Base class for attributed (citation-grounded) generation tasks."""

    def __init__(self) -> None:
        super().__init__()
        self.label_dtype_ = None
        self.task_type_ = "attribute"

    @property
    def peft_task_type(self) -> str:
        return "ATTRIBUTE"
919
+
920
# Global task registry (name -> BasicTask); task modules populate it via
# their update_task_dict(task_dict) hooks (see e.g. glue_tasks.py).
task_dict = {}
921
+
922
+
923
+ # Multi-Task (Only for train)
924
# Multi-Task (Only for train)
class MultiTask(BasicTask):
    """Training-only task that concatenates data from several registered
    tasks, given as a ";"-separated list of task names."""

    def __init__(self, task_names: str) -> None:
        super().__init__()
        self.task_type_ = "multi_task"
        self.label_dtype_ = None
        # Resolve each name through the global registry.
        self.task_list_: List[BasicTask] = [
            task_dict[name] for name in task_names.split(";")
        ]

    def loading_data(
        self, is_train: bool = True, path: Optional[str] = None
    ) -> List[InputData]:
        logging.info(f"Preparing data for {len(self.task_list_)} tasks")
        assert is_train
        # Optional ";"-separated per-task data paths, aligned by index.
        path_list = None if path is None else path.split(";")
        merged: List[InputData] = []
        for idx, sub_task in enumerate(self.task_list_):
            sub_path = "" if path_list is None else path_list[idx].strip()
            merged.extend(
                sub_task.loading_data(is_train, None if len(sub_path) == 0 else sub_path)
            )
        return merged
945
+
946
+
947
def main():
    """Ad-hoc evaluation driver for re-scoring saved model outputs.

    NOTE(review): the triple-quoted blocks are disabled experiments kept
    verbatim; only the QAMPARI re-scoring path below is active.
    """
    """source = '/yy21/MoE-PEFT/dataset/APO/preference_data.jsonl'
    data = []
    with open(source, 'r') as f:
        for line in f:
            y = json.loads(line)
            output = ""
            for s in y['statements']:
                if isinstance(s, List):
                    for i in s:
                        output += i + " "
                else:
                    dot = s['statement'].strip()[-1]
                    output += s['statement'].strip()[:-1]
                    if 'revised_used_document' in s:
                        for i in s['revised_used_document']:
                            output += '[' + i + ']'
                    else:
                        if len(s['used_document']) != 0:
                            for i in s['used_document']:
                                output += '[' + i + ']'
                    output += dot + ' '

            docs = [d['text'] for d in y['documents']]
            fk = {
                'query': y['query'],
                'output': output,
                'docs': docs,
            }
            ans = compute_autoais(fk)
            print(ans)"""

    # Return everything after the "[ANSWER]" marker; the trailing 4
    # characters are also dropped (presumably an end-of-text tag --
    # TODO confirm against the generation format).
    def split_docs_and_answer(input_str):

        if "[ANSWER]" not in input_str:
            return ""
        index = input_str.find("[ANSWER]")
        ans = input_str[index + len("[ANSWER]"):][:-4].strip()

        return ans

    test_data = []
    # Pair each generated response (jsonl, one per line) with its source
    # dataset record (json list) by line index.
    with open('/yy21/test_qamp_v2.jsonl', "r", encoding="utf-8") as fuck:
        with open('/yy21/MoE-PEFT/dataset/front_output/qampari.json', "r", encoding="utf-8") as f:
            data = json.load(f)
            for idx, line in enumerate(fuck):
                opt = json.loads(line)

                # Normalize citation tags: "[ref_3]" -> "[3]".
                ori_output = re.sub(r'\[ref_(\d+)\]', r'[\1]', opt['response'])
                #qa_pairs = data[idx]['qa_pairs']
                answer = data[idx]['answer']
                query = data[idx]['question']

                output = split_docs_and_answer(ori_output)
                ori_docs = []
                for i in range(5):
                    ori_docs.append(data[idx]['docs'][i]['text'])
                # NOTE(review): 'output' is the raw response, not the
                # split_docs_and_answer() result computed above -- the
                # stripped `output` variable is unused here; verify
                # which one compute_autoais should receive.
                fk = {
                    #'qa_pairs' : qa_pairs,
                    'answer' : answer,
                    'query' : query,
                    'docs' : ori_docs,
                    'output' : ori_output
                }
                test_data.append(fk)
    # Citation precision/recall over all assembled records.
    ans = compute_autoais(test_data, qampari=True)
    print(ans)
    """with open('/yy21/test_eli5_output0.jsonl', "r", encoding="utf-8") as fuck,\
        open('/yy21/test_eli5_output.jsonl', "w", encoding="utf-8") as outputf:
        for idx, line in enumerate(fuck):
            opt = json.loads(line)
            opt['accuracy'] = acc[idx]
            outputf.write(json.dumps(opt, ensure_ascii=False) + '\n')"""
    """ with open('/yy21/MoE-PEFT/dataset/front_output/eli5.json', "r", encoding="utf-8") as f:
        data = json.load(f)
        test_data = []
        for data_point in data:

            ori_output = data_point['output']
            qa_pairs = data_point['claims']
            answer = data_point['answer']
            query = data_point['question']

            output = split_docs_and_answer(ori_output)
            ori_docs = []
            for i in range(5):
                ori_docs.append(data_point['docs'][i]['text'])
            fk = {
                'qa_pairs' : qa_pairs,
                'answer' : answer,
                'query' : query,
                'docs' : ori_docs,
                'output' : output
            }
            test_data.append(fk)
        ans = compute_claims(test_data)
        print(ans)"""


if __name__ == "__main__":
    main()
c2cite/tasks/glue_tasks.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from .common import SequenceClassificationTask
4
+
5
+
6
def update_task_dict(task_dict):
    """Register the eight GLUE classification subtasks into `task_dict`."""

    def pair_loader(*fields):
        # Bind `fields` now (factory argument), avoiding the classic
        # late-binding closure pitfall in loops.
        return lambda data_point: (
            [data_point[f] for f in fields],
            [int(data_point["label"])],
        )

    # (task name, number of labels, record loader)
    specs = [
        ("glue:cola", 2, pair_loader("sentence")),
        ("glue:mnli", 3, pair_loader("premise", "hypothesis")),
        ("glue:mrpc", 2, pair_loader("sentence1", "sentence2")),
        ("glue:qnli", 2, pair_loader("question", "sentence")),
        ("glue:qqp", 2, pair_loader("question1", "question2")),
        ("glue:rte", 2, pair_loader("sentence1", "sentence2")),
        ("glue:sst2", 2, pair_loader("sentence")),
        # wnli joins its sentence pair into a single input string.
        (
            "glue:wnli",
            2,
            lambda data_point: (
                [data_point["sentence1"] + " </s> " + data_point["sentence2"]],
                [int(data_point["label"])],
            ),
        ),
    ]
    task_dict.update(
        {
            name: SequenceClassificationTask(
                task_name=name,
                task_type="single_label_classification",
                num_labels=num_labels,
                label_dtype=torch.long,
                dataload_function=loader,
            )
            for name, num_labels, loader in specs
        }
    )